diff options
author | max42 <max42@yandex-team.com> | 2023-06-30 03:37:03 +0300 |
---|---|---|
committer | max42 <max42@yandex-team.com> | 2023-06-30 03:37:03 +0300 |
commit | fac2bd72b4b31ec3238292caf8fb2a8aaa6d6c4a (patch) | |
tree | b8cbc1deb00309c7f1a7ab6df520a76cf0b5c6d7 /yt/cpp/mapreduce/interface | |
parent | 7bf166b1a7ed0af927f230022b245af618e998c1 (diff) | |
download | ydb-fac2bd72b4b31ec3238292caf8fb2a8aaa6d6c4a.tar.gz |
YT-19324: move YT provider to ydb/library/yql
This commit is formed by the following script: https://paste.yandex-team.ru/6f92e4b8-efc5-4d34-948b-15ee2accd7e7/text.
This commit has zero effect on all projects that depend on YQL.
The summary of changes:
- `yql/providers/yt -> ydb/library/yql/providers/yt` - the whole implementation of YT provider is moved into YDB code base for further export as a part of YT YQL plugin shared library;
- `yql/providers/stat/{expr_nodes,uploader} -> ydb/library/yql/providers/stat/{expr_nodes,uploader}` - a small interface without implementation and the description of stat expr nodes;
- `yql/core/extract_predicate/ut -> ydb/library/yql/core/extract_predicate/ut`;
- `yql/core/{ut,ut_common} -> ydb/library/yql/core/{ut,ut_common}`;
- `yql/core` is gone;
- `yql/library/url_preprocessing -> ydb/library/yql/core/url_preprocessing`.
**NB**: all new targets inside `ydb/` are under an `IF (NOT CMAKE_EXPORT)` clause, which excludes them from open-source cmake generation and from the ya make build. They will be enabled in subsequent commits.
Diffstat (limited to 'yt/cpp/mapreduce/interface')
66 files changed, 21022 insertions, 0 deletions
diff --git a/yt/cpp/mapreduce/interface/batch_request.cpp b/yt/cpp/mapreduce/interface/batch_request.cpp new file mode 100644 index 0000000000..fefdacb61a --- /dev/null +++ b/yt/cpp/mapreduce/interface/batch_request.cpp @@ -0,0 +1,15 @@ +#include "batch_request.h" +#include "client.h" + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +IBatchRequestBase& IBatchRequest::WithTransaction(const ITransactionPtr& transaction) +{ + return WithTransaction(transaction->GetId()); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/batch_request.h b/yt/cpp/mapreduce/interface/batch_request.h new file mode 100644 index 0000000000..3ea28f76fd --- /dev/null +++ b/yt/cpp/mapreduce/interface/batch_request.h @@ -0,0 +1,222 @@ +#pragma once + +#include "fwd.h" + +#include "client_method_options.h" + +#include <library/cpp/threading/future/future.h> +#include <util/generic/ptr.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////// + +/// Helper base of @ref NYT::IBatchRequest holding most of useful methods. +class IBatchRequestBase + : public TThrRefBase +{ +public: + virtual ~IBatchRequestBase() = default; + + /// + /// @brief Create cypress node. + /// + /// @see NYT::ICypressClient::Create + virtual ::NThreading::TFuture<TNodeId> Create( + const TYPath& path, + ENodeType type, + const TCreateOptions& options = TCreateOptions()) = 0; + + /// + /// @brief Remove cypress node. + /// + /// @see NYT::ICypressClient::Remove + virtual ::NThreading::TFuture<void> Remove( + const TYPath& path, + const TRemoveOptions& options = TRemoveOptions()) = 0; + + /// + /// @brief Check wether cypress node exists. 
+ /// + /// @see NYT::ICypressClient::Exists + virtual ::NThreading::TFuture<bool> Exists( + const TYPath& path, + const TExistsOptions& options = TExistsOptions()) = 0; + + /// + /// @brief Get cypress node. + /// + /// @see NYT::ICypressClient::Get + virtual ::NThreading::TFuture<TNode> Get( + const TYPath& path, + const TGetOptions& options = TGetOptions()) = 0; + + /// + /// @brief Set cypress node. + /// + /// @see NYT::ICypressClient::Set + virtual ::NThreading::TFuture<void> Set( + const TYPath& path, + const TNode& node, + const TSetOptions& options = TSetOptions()) = 0; + + /// + /// @brief List cypress directory. + /// + /// @see NYT::ICypressClient::List + virtual ::NThreading::TFuture<TNode::TListType> List( + const TYPath& path, + const TListOptions& options = TListOptions()) = 0; + + /// + /// @brief Copy cypress node. + /// + /// @see NYT::ICypressClient::Copy + virtual ::NThreading::TFuture<TNodeId> Copy( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TCopyOptions& options = TCopyOptions()) = 0; + + /// + /// @brief Move cypress node. + /// + /// @see NYT::ICypressClient::Move + virtual ::NThreading::TFuture<TNodeId> Move( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TMoveOptions& options = TMoveOptions()) = 0; + + /// + /// @brief Create symbolic link. + /// + /// @see NYT::ICypressClient::Link. + virtual ::NThreading::TFuture<TNodeId> Link( + const TYPath& targetPath, + const TYPath& linkPath, + const TLinkOptions& options = TLinkOptions()) = 0; + + /// + /// @brief Lock cypress node. + /// + /// @see NYT::ICypressClient::Lock + virtual ::NThreading::TFuture<ILockPtr> Lock( + const TYPath& path, + ELockMode mode, + const TLockOptions& options = TLockOptions()) = 0; + + /// + /// @brief Unlock cypress node. 
+ /// + /// @see NYT::ICypressClient::Unlock + virtual ::NThreading::TFuture<void> Unlock( + const TYPath& path, + const TUnlockOptions& options = TUnlockOptions()) = 0; + + /// + /// @brief Abort operation. + /// + /// @see NYT::IClient::AbortOperation + virtual ::NThreading::TFuture<void> AbortOperation(const TOperationId& operationId) = 0; + + /// + /// @brief Force complete operation. + /// + /// @see NYT::IClient::CompleteOperation + virtual ::NThreading::TFuture<void> CompleteOperation(const TOperationId& operationId) = 0; + + /// + /// @brief Suspend operation. + /// + /// @see NYT::IClient::SuspendOperation + virtual ::NThreading::TFuture<void> SuspendOperation( + const TOperationId& operationId, + const TSuspendOperationOptions& options = TSuspendOperationOptions()) = 0; + + /// + /// @brief Resume operation. + /// + /// @see NYT::IClient::ResumeOperation + virtual ::NThreading::TFuture<void> ResumeOperation( + const TOperationId& operationId, + const TResumeOperationOptions& options = TResumeOperationOptions()) = 0; + + /// + /// @brief Update parameters of running operation. + /// + /// @see NYT::IClient::UpdateOperationParameters + virtual ::NThreading::TFuture<void> UpdateOperationParameters( + const TOperationId& operationId, + const TUpdateOperationParametersOptions& options = TUpdateOperationParametersOptions()) = 0; + + /// + /// @brief Canonize cypress path + /// + /// @see NYT::ICypressClient::CanonizeYPath + virtual ::NThreading::TFuture<TRichYPath> CanonizeYPath(const TRichYPath& path) = 0; + + /// + /// @brief Get table columnar statistic + /// + /// @see NYT::ICypressClient::GetTableColumnarStatistics + virtual ::NThreading::TFuture<TVector<TTableColumnarStatistics>> GetTableColumnarStatistics( + const TVector<TRichYPath>& paths, + const TGetTableColumnarStatisticsOptions& options = {}) = 0; + + /// + /// @brief Check permission for given path. 
+ /// + /// @see NYT::IClient::CheckPermission + virtual ::NThreading::TFuture<TCheckPermissionResponse> CheckPermission( + const TString& user, + EPermission permission, + const TYPath& path, + const TCheckPermissionOptions& options = TCheckPermissionOptions()) = 0; +}; + +/// +/// @brief Batch request object. +/// +/// Allows to send multiple lightweight requests at once significantly +/// reducing time of their execution. +/// +/// Methods of this class accept same arguments as @ref NYT::IClient methods but +/// return TFuture that is set after execution of @ref NYT::IBatchRequest::ExecuteBatch +/// +/// @see [Example of usage](https://a.yandex-team.ru/arc/trunk/arcadia/yt/cpp/mapreduce/examples/tutorial/batch_request/main.cpp) +class IBatchRequest + : public IBatchRequestBase +{ +public: + /// + /// @brief Temporary override current transaction. + /// + /// Using WithTransaction user can temporary override default transaction. + /// Example of usage: + /// TBatchRequest batchRequest; + /// auto noTxResult = batchRequest.Get("//some/path"); + /// auto txResult = batchRequest.WithTransaction(tx).Get("//some/path"); + virtual IBatchRequestBase& WithTransaction(const TTransactionId& transactionId) = 0; + IBatchRequestBase& WithTransaction(const ITransactionPtr& transaction); + + /// + /// @brief Executes all subrequests of batch request. + /// + /// After execution of this method all TFuture objects returned by subrequests will + /// be filled with either result or error. + /// + /// @note It is undefined in which order these requests are executed. + /// + /// @note This method doesn't throw if subrequest emits error. + /// Instead corresponding future is set with exception. + /// So it is always important to check TFuture status. + /// + /// Single TBatchRequest instance may be executed only once + /// and cannot be modified (filled with additional requests) after execution. 
+ /// Exception is thrown on attempt to modify executed batch request + /// or execute it again. + virtual void ExecuteBatch(const TExecuteBatchOptions& options = TExecuteBatchOptions()) = 0; +}; + +//////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/client.cpp b/yt/cpp/mapreduce/interface/client.cpp new file mode 100644 index 0000000000..11d308b809 --- /dev/null +++ b/yt/cpp/mapreduce/interface/client.cpp @@ -0,0 +1,19 @@ +#include "client.h" + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +void ILock::Wait(TDuration timeout) +{ + return GetAcquiredFuture().GetValue(timeout); +} + +void ITransaction::Detach() +{ + Y_FAIL("ITransaction::Detach() is not implemented"); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/client.h b/yt/cpp/mapreduce/interface/client.h new file mode 100644 index 0000000000..54f37c3ae0 --- /dev/null +++ b/yt/cpp/mapreduce/interface/client.h @@ -0,0 +1,568 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/client.h +/// +/// Main header of the C++ YT Wrapper. + +/// +/// @mainpage C++ library for working with YT +/// +/// This library provides possibilities to work with YT as a [MapReduce](https://en.wikipedia.org/wiki/MapReduce) system. It allows: +/// - to read/write tables and files +/// - to run operations +/// - to work with transactions. +/// +/// This library provides only basic functions for working with dynamic tables. +/// To access full powers of YT dynamic tables one should use +/// [yt/client](https://a.yandex-team.ru/arc/trunk/arcadia/yt/19_4/yt/client) library. 
+/// +/// Entry points to this library: +/// - @ref NYT::Initialize() initialization function for this library; +/// - @ref NYT::IClient main interface to work with YT cluster; +/// - @ref NYT::CreateClient() function that creates client for particular cluster; +/// - @ref NYT::IOperationClient ancestor of @ref NYT::IClient containing the set of methods to run operations. +/// +/// Tutorial on how to use this library can be found [here](https://yt.yandex-team.ru/docs/api/c++/examples). + +#include "fwd.h" + +#include "client_method_options.h" +#include "constants.h" +#include "batch_request.h" +#include "cypress.h" +#include "init.h" +#include "io.h" +#include "node.h" +#include "operation.h" + +#include <library/cpp/threading/future/future.h> + +#include <util/datetime/base.h> +#include <util/generic/maybe.h> +#include <util/system/compiler.h> + +/// Main namespace of YT client +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// OAuth info (returned by @ref NYT::IClient::WhoAmI). +struct TAuthorizationInfo +{ + /// User's login. + TString Login; + + /// Realm. + TString Realm; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Part of @ref NYT::TCheckPermissionResponse. +/// +/// In case when 'Action == ESecurityAction::Deny' because of a 'deny' rule, +/// the "denying" object name and id and "denied" subject name an id may be returned. +struct TCheckPermissionResult +{ + /// Was the access granted or not. + ESecurityAction Action; + + /// Id of the object whose ACL's "deny" rule forbids the access. + TMaybe<TGUID> ObjectId; + + /// + /// @brief Name of the object whose ACL's "deny" rule forbids the access. + /// + /// Example is "node //tmp/x/y". + TMaybe<TString> ObjectName; + + /// Id of the subject for whom the access was denied by a "deny" rule. + TMaybe<TGUID> SubjectId; + + /// Name of the subject for whom the access was denied by a "deny" rule. 
+ TMaybe<TString> SubjectName; +}; + +/// @brief Result of @ref NYT::IClient::CheckPermission command. +/// +/// The base part of the response corresponds to the check result for the node itself. +/// `Columns` vector contains check results for the columns (in the same order as in the request). +struct TCheckPermissionResponse + : public TCheckPermissionResult +{ + /// @brief Results for the table columns access permissions. + /// + /// @see [Columnar ACL doc](https://yt.yandex-team.ru/docs/description/common/columnar_acl) + TVector<TCheckPermissionResult> Columns; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Interface representing a lock obtained from @ref NYT::ITransaction::Lock. +/// +/// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#start-tx) +class ILock + : public TThrRefBase +{ +public: + virtual ~ILock() = default; + + /// Get cypress node id of lock itself. + virtual const TLockId& GetId() const = 0; + + /// Get cypress node id of locked object. + virtual TNodeId GetLockedNodeId() const = 0; + + /// + /// @brief Get future that will be set once lock is in "acquired" state. + /// + /// Note that future might contain exception if some error occurred + /// e.g. lock transaction was aborted. + virtual const ::NThreading::TFuture<void>& GetAcquiredFuture() const = 0; + + /// + /// @brief Wait until lock is in "acquired" state. + /// + /// Throws exception if timeout exceeded or some error occurred + /// e.g. lock transaction was aborted. + void Wait(TDuration timeout = TDuration::Max()); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Base class for @ref NYT::IClient and @ref NYT::ITransaction. +/// +/// This class contains transactional commands. 
+class IClientBase + : public TThrRefBase + , public ICypressClient + , public IIOClient + , public IOperationClient +{ +public: + /// + /// @brief Start a [transaction] (https://yt.yandex-team.ru/docs/description/storage/transactions.html#master_transactions). + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#start-tx) + [[nodiscard]] virtual ITransactionPtr StartTransaction( + const TStartTransactionOptions& options = TStartTransactionOptions()) = 0; + + /// + /// @brief Change properties of table. + /// + /// Allows to: + /// - switch table between dynamic/static mode + /// - or change table schema + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#alter-table) + virtual void AlterTable( + const TYPath& path, + const TAlterTableOptions& options = TAlterTableOptions()) = 0; + + /// + /// @brief Create batch request object that allows to execute several light requests in parallel. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#execute-batch) + virtual TBatchRequestPtr CreateBatchRequest() = 0; + + /// @brief Get root client outside of all transactions. + virtual IClientPtr GetParentClient() = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + + +/// @brief Interface representing a master transaction. +/// +/// @see [YT doc](https://yt.yandex-team.ru/docs/description/storage/transactions.html#master_transactions) +class ITransaction + : virtual public IClientBase +{ +public: + /// Get id of transaction. + virtual const TTransactionId& GetId() const = 0; + + /// + /// @brief Try to lock given path. + /// + /// Lock will be held until transaction is commited/aborted or @ref NYT::ITransaction::Unlock method is called. + /// Lock modes: + /// - `LM_EXCLUSIVE`: if exclusive lock is taken no other transaction can take exclusive or shared lock. + /// - `LM_SHARED`: if shared lock is taken other transactions can take shared lock but not exclusive. 
+ /// - `LM_SNAPSHOT`: snapshot lock always succeeds, when snapshot lock is taken current transaction snapshots object. + /// It will not see changes that occurred to it in other transactions. + /// + /// Exclusive/shared lock can be waitable or not. + /// If nonwaitable lock cannot be taken exception is thrown. + /// If waitable lock cannot be taken it is created in pending state and client can wait until it actually taken. + /// Check @ref NYT::TLockOptions::Waitable and @ref NYT::ILock::GetAcquiredFuture for more details. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#lock) + virtual ILockPtr Lock( + const TYPath& path, + ELockMode mode, + const TLockOptions& options = TLockOptions()) = 0; + + /// + /// @brief Remove all the locks (including pending ones) for this transaction from a Cypress node at `path`. + /// + /// If the locked version of the node differs from the original one, + /// an error will be thrown. + /// + /// Command is successful even if the node has no locks. + /// Only explicit (created by @ref NYT::ITransaction::Lock) locks are removed. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#unlock) + virtual void Unlock( + const TYPath& path, + const TUnlockOptions& options = TUnlockOptions()) = 0; + + /// + /// @brief Commit transaction. + /// + /// All changes that are made by transactions become visible globally or to parent transaction. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#commit) + virtual void Commit() = 0; + + /// + /// @brief Abort transaction. + /// + /// All changes made by current transaction are lost. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#abort) + virtual void Abort() = 0; + + /// @brief Explicitly ping transaction. + /// + /// User usually does not need this method (as transactions are pinged automatically, + /// see @ref NYT::TStartTransactionOptions::AutoPingable). 
+ /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#ping) + virtual void Ping() = 0; + + /// + /// @brief Detach transaction. + /// + /// Stop any activities connected with it: pinging, aborting on crashes etc. + /// Forget about the transaction totally. + virtual void Detach(); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Interface containing non-transactional commands. +class IClient + : virtual public IClientBase +{ +public: + /// + /// @brief Attach to existing master transaction. + /// + /// Returned object WILL NOT: + /// - ping transaction automatically (unless @ref NYT::TAttachTransactionOptions::AutoPing is set) + /// - abort it on program termination (unless @ref NYT::TAttachTransactionOptions::AbortOnTermination is set). + /// Otherwise returned object is similar to the object returned by @ref NYT::IClientBase::StartTransaction. + /// and it can see all the changes made inside the transaction. + [[nodiscard]] virtual ITransactionPtr AttachTransaction( + const TTransactionId& transactionId, + const TAttachTransactionOptions& options = TAttachTransactionOptions()) = 0; + + /// + /// @brief Mount dynamic table. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#mount-table) + virtual void MountTable( + const TYPath& path, + const TMountTableOptions& options = TMountTableOptions()) = 0; + + /// + /// @brief Unmount dynamic table. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#unmount-table) + virtual void UnmountTable( + const TYPath& path, + const TUnmountTableOptions& options = TUnmountTableOptions()) = 0; + + /// + /// @brief Remount dynamic table. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#remount-table) + virtual void RemountTable( + const TYPath& path, + const TRemountTableOptions& options = TRemountTableOptions()) = 0; + + /// + /// @brief Switch dynamic table from `mounted' into `frozen' state. 
+ /// + /// When table is in frozen state all its data is flushed to disk and writes are disabled. + /// + /// @note this function launches the process of switching, but doesn't wait until switching is accomplished. + /// Waiting has to be performed by user. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#freeze-table) + virtual void FreezeTable( + const TYPath& path, + const TFreezeTableOptions& options = TFreezeTableOptions()) = 0; + + /// + /// @brief Switch dynamic table from `frozen` into `mounted` state. + /// + /// @note this function launches the process of switching, but doesn't wait until switching is accomplished. + /// Waiting has to be performed by user. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#unfreeze-table) + virtual void UnfreezeTable( + const TYPath& path, + const TUnfreezeTableOptions& options = TUnfreezeTableOptions()) = 0; + + /// + /// @brief Reshard dynamic table (break it into tablets) by given pivot keys. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#reshard-table) + virtual void ReshardTable( + const TYPath& path, + const TVector<TKey>& pivotKeys, + const TReshardTableOptions& options = TReshardTableOptions()) = 0; + + /// + /// @brief Reshard dynamic table, breaking it into given number of tablets. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#reshard-table) + virtual void ReshardTable( + const TYPath& path, + i64 tabletCount, + const TReshardTableOptions& options = TReshardTableOptions()) = 0; + + /// + /// @brief Insert rows into dynamic table. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#insert-rows) + virtual void InsertRows( + const TYPath& path, + const TNode::TListType& rows, + const TInsertRowsOptions& options = TInsertRowsOptions()) = 0; + + /// + /// @brief Delete rows from dynamic table. 
+ /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#delete-rows) + virtual void DeleteRows( + const TYPath& path, + const TNode::TListType& keys, + const TDeleteRowsOptions& options = TDeleteRowsOptions()) = 0; + + /// + /// @brief Trim rows from the beginning of ordered dynamic table. + /// + /// Asynchronously removes `rowCount` rows from the beginning of ordered dynamic table. + /// Numeration of remaining rows *does not change*, e.g. after `trim(10)` and `trim(20)` + /// you get in total `20` deleted rows. + /// + /// @param path Path to ordered dynamic table. + /// @param tabletIndex Which tablet to trim. + /// @param rowCount How many trimmed rows will be in the table after command. + /// @param options Optional parameters. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#trim-rows) + virtual void TrimRows( + const TYPath& path, + i64 tabletIndex, + i64 rowCount, + const TTrimRowsOptions& options = TTrimRowsOptions()) = 0; + + /// + /// @brief Lookup rows with given keys from dynamic table. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#lookup-rows) + virtual TNode::TListType LookupRows( + const TYPath& path, + const TNode::TListType& keys, + const TLookupRowsOptions& options = TLookupRowsOptions()) = 0; + + /// + /// @brief Select rows from dynamic table, using [SQL dialect](https://yt.yandex-team.ru/docs//description/dynamic_tables/dyn_query_language.html). + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#select-rows) + virtual TNode::TListType SelectRows( + const TString& query, + const TSelectRowsOptions& options = TSelectRowsOptions()) = 0; + + /// + /// @brief Change properties of table replica. + /// + /// Allows to enable/disable replica and/or change its mode. 
+ /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#alter-table-replica) + virtual void AlterTableReplica( + const TReplicaId& replicaId, + const TAlterTableReplicaOptions& alterTableReplicaOptions) = 0; + + /// + /// @brief Generate a monotonously increasing master timestamp. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#generate-timestamp) + virtual ui64 GenerateTimestamp() = 0; + + /// Return YT username of current client. + virtual TAuthorizationInfo WhoAmI() = 0; + + /// + /// @brief Get operation attributes. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-operation) + virtual TOperationAttributes GetOperation( + const TOperationId& operationId, + const TGetOperationOptions& options = TGetOperationOptions()) = 0; + + /// + /// @brief List operations satisfying given filters. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#list-operations) + virtual TListOperationsResult ListOperations( + const TListOperationsOptions& options = TListOperationsOptions()) = 0; + + /// + /// @brief Update operation runtime parameters. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#update-op-parameters) + virtual void UpdateOperationParameters( + const TOperationId& operationId, + const TUpdateOperationParametersOptions& options) = 0; + + /// + /// @brief Get job attributes. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-job) + virtual TJobAttributes GetJob( + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobOptions& options = TGetJobOptions()) = 0; + + /// + /// List attributes of jobs satisfying given filters. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#list-jobs) + virtual TListJobsResult ListJobs( + const TOperationId& operationId, + const TListJobsOptions& options = TListJobsOptions()) = 0; + + /// + /// @brief Get the input of a running or failed job. 
+ /// + /// @ref NYT::TErrorResponse exception is thrown if job is missing. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-job-input) + virtual IFileReaderPtr GetJobInput( + const TJobId& jobId, + const TGetJobInputOptions& options = TGetJobInputOptions()) = 0; + + /// + /// @brief Get fail context of a failed job. + /// + /// @ref NYT::TErrorResponse exception is thrown if it is missing. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-job-fail-context) + virtual IFileReaderPtr GetJobFailContext( + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobFailContextOptions& options = TGetJobFailContextOptions()) = 0; + + /// + /// @brief Get stderr of a running or failed job. + /// + /// @ref NYT::TErrorResponse exception is thrown if it is missing. + /// + /// @note YT doesn't store all job stderrs + /// + /// @note If job stderr exceeds few megabytes YT will store only head and tail of stderr. + /// + /// @see Description of `max_stderr_size` spec option [here](https://yt.yandex-team.ru/docs//description/mr/operations_options.html). + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-job-stderr) + virtual IFileReaderPtr GetJobStderr( + const TOperationId& operationId, + const TJobId& jobId, + const TGetJobStderrOptions& options = TGetJobStderrOptions()) = 0; + + /// + /// @brief Create one or several rbtorrents for files in a blob table. + /// + /// If specified, one torrent is created for each value of `KeyColumns` option. + /// Otherwise, a single torrent with all files of a table is created. + /// + /// @return list of nodes, each node has two fields + /// * `key`: list of key columns values. Empty if `KeyColumns` is not specified. 
+ /// * `rbtorrent`: rbtorrent string (with `rbtorrent:` prefix) + /// + /// @see [More info.](https://docs.yandex-team.ru/docs/yt/description/storage/blobtables#sky_share) + virtual TNode::TListType SkyShareTable( + const std::vector<TYPath>& tablePaths, + const TSkyShareTableOptions& options) = 0; + + /// + /// @brief Check if `user` has `permission` to access a Cypress node at `path`. + /// + /// For tables access to columns specified in `options.Columns_` can be checked + /// (@see [the doc](https://yt.yandex-team.ru/docs/description/common/columnar_acl)). + /// + /// If access is denied (the returned result has `.Action == ESecurityAction::Deny`) + /// because of a `deny` rule, the "denying" object name and id + /// and "denied" subject name an id may be returned. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#check_permission) + virtual TCheckPermissionResponse CheckPermission( + const TString& user, + EPermission permission, + const TYPath& path, + const TCheckPermissionOptions& options = TCheckPermissionOptions()) = 0; + + /// @brief Get information about tablet + /// @see NYT::TTabletInfo + virtual TVector<TTabletInfo> GetTabletInfos( + const TYPath& path, + const TVector<int>& tabletIndexes, + const TGetTabletInfosOptions& options = TGetTabletInfosOptions()) = 0; + + /// + /// @brief Suspend operation. + /// + /// Jobs will be aborted. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#suspend_op) + virtual void SuspendOperation( + const TOperationId& operationId, + const TSuspendOperationOptions& options = TSuspendOperationOptions()) = 0; + + /// @brief Resume previously suspended operation. 
+ /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#resume_op) + virtual void ResumeOperation( + const TOperationId& operationId, + const TResumeOperationOptions& options = TResumeOperationOptions()) = 0; + + /// + /// @brief Synchronously terminates all client's background activities + /// + /// e.g. no callbacks will be executed after the function is completed + /// + /// @note It is safe to call Shutdown multiple times + /// + /// @note @ref NYT::TApiUsageError will be thrown if any client's method is called after shutdown + /// + virtual void Shutdown() = 0; +}; + + +/// Create a client for particular MapReduce cluster. +IClientPtr CreateClient( + const TString& serverName, + const TCreateClientOptions& options = TCreateClientOptions()); + + +/// Create a client for mapreduce cluster specified in `YT_PROXY` environment variable. +IClientPtr CreateClientFromEnv( + const TCreateClientOptions& options = TCreateClientOptions()); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/client_method_options.cpp b/yt/cpp/mapreduce/interface/client_method_options.cpp new file mode 100644 index 0000000000..66f72bfe5f --- /dev/null +++ b/yt/cpp/mapreduce/interface/client_method_options.cpp @@ -0,0 +1,34 @@ +#include "client_method_options.h" + +#include "tvm.h" + +namespace NYT { + +template <typename T> +static void MergeMaybe(TMaybe<T>& origin, const TMaybe<T>& patch) +{ + if (patch) { + origin = patch; + } +} + +void TFormatHints::Merge(const TFormatHints& patch) +{ + if (patch.SkipNullValuesForTNode_) { + SkipNullValuesForTNode(true); + } + MergeMaybe(EnableStringToAllConversion_, patch.EnableStringToAllConversion_); + MergeMaybe(EnableAllToStringConversion_, patch.EnableAllToStringConversion_); + MergeMaybe(EnableIntegralTypeConversion_, patch.EnableIntegralTypeConversion_); + MergeMaybe(EnableIntegralToDoubleConversion_, 
patch.EnableIntegralToDoubleConversion_); + MergeMaybe(EnableTypeConversion_, patch.EnableTypeConversion_); + MergeMaybe(ComplexTypeMode_, patch.ComplexTypeMode_); +} + +TCreateClientOptions& TCreateClientOptions::ServiceTicketAuth(const NAuth::IServiceTicketAuthPtrWrapper& wrapper) +{ + ServiceTicketAuth_ = std::make_shared<NAuth::IServiceTicketAuthPtrWrapper>(wrapper); + return *this; +} + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/client_method_options.h b/yt/cpp/mapreduce/interface/client_method_options.h new file mode 100644 index 0000000000..8074632353 --- /dev/null +++ b/yt/cpp/mapreduce/interface/client_method_options.h @@ -0,0 +1,1452 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/client_method_options.h +/// +/// Header containing options for @ref NYT::IClient methods. + +#include "common.h" +#include "config.h" +#include "format.h" +#include "public.h" +#include "retry_policy.h" + +#include <util/datetime/base.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// Type of the cypress node. +enum ENodeType : int +{ + NT_STRING /* "string_node" */, + NT_INT64 /* "int64_node" */, + NT_UINT64 /* "uint64_node" */, + NT_DOUBLE /* "double_node" */, + NT_BOOLEAN /* "boolean_node" */, + NT_MAP /* "map_node" */, + NT_LIST /* "list_node" */, + NT_FILE /* "file" */, + NT_TABLE /* "table" */, + NT_DOCUMENT /* "document" */, + NT_REPLICATED_TABLE /* "replicated_table" */, + NT_TABLE_REPLICA /* "table_replica" */, + NT_USER /* "user" */, + NT_SCHEDULER_POOL /* "scheduler_pool" */, + NT_LINK /* "link" */, +}; + +/// +/// @brief Mode of composite type representation in yson. 
+/// +/// @see https://yt.yandex-team.ru/docs/description/storage/data_types#yson +enum class EComplexTypeMode : int +{ + Named /* "named" */, + Positional /* "positional" */, +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Create +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#create +struct TCreateOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TCreateOptions; + /// @endcond + + /// Create missing parent directories if required. + FLUENT_FIELD_DEFAULT(bool, Recursive, false); + + /// + /// @brief Do not raise error if node already exists. + /// + /// Node is not recreated. + /// Force and IgnoreExisting MUST NOT be used simultaneously. + FLUENT_FIELD_DEFAULT(bool, IgnoreExisting, false); + + /// + /// @brief Recreate node if it exists. + /// + /// Force and IgnoreExisting MUST NOT be used simultaneously. + FLUENT_FIELD_DEFAULT(bool, Force, false); + + /// @brief Set node attributes. + FLUENT_FIELD_OPTION(TNode, Attributes); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Remove +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#remove +struct TRemoveOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TRemoveOptions; + /// @endcond + + /// + /// @brief Remove whole tree when removing composite cypress node (e.g. `map_node`). + /// + /// Without this option removing nonempty composite node will fail. + FLUENT_FIELD_DEFAULT(bool, Recursive, false); + + /// @brief Do not fail if removing node doesn't exist. + FLUENT_FIELD_DEFAULT(bool, Force, false); +}; + +/// Base class for options for operations that read from master. +template <typename TDerived> +struct TMasterReadOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// @brief Where to read from. 
+ FLUENT_FIELD_OPTION(EMasterReadKind, ReadFrom); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Exists +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#exists +struct TExistsOptions + : public TMasterReadOptions<TExistsOptions> +{ +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Get +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#get +struct TGetOptions + : public TMasterReadOptions<TGetOptions> +{ + /// @brief Attributes that should be fetched with each node. + FLUENT_FIELD_OPTION(TAttributeFilter, AttributeFilter); + + /// @brief Limit for the number of children node. + FLUENT_FIELD_OPTION(i64, MaxSize); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Set +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#set +struct TSetOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TSetOptions; + /// @endcond + + /// Create missing parent directories if required. + FLUENT_FIELD_DEFAULT(bool, Recursive, false); + + /// Allow setting any nodes, not only attribute and document ones. + FLUENT_FIELD_OPTION(bool, Force); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::MultisetAttributes +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#multiset_attributes +struct TMultisetAttributesOptions +{ }; + +/// +/// @brief Options for @ref NYT::ICypressClient::List +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#list +struct TListOptions + : public TMasterReadOptions<TListOptions> +{ + /// @cond Doxygen_Suppress + using TSelf = TListOptions; + /// @endcond + + /// Attributes that should be fetched for each node. + FLUENT_FIELD_OPTION(TAttributeFilter, AttributeFilter); + + /// Limit for the number of children that will be fetched. 
+ FLUENT_FIELD_OPTION(i64, MaxSize); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Copy +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#copy +struct TCopyOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TCopyOptions; + /// @endcond + + /// Create missing directories in destination path if required. + FLUENT_FIELD_DEFAULT(bool, Recursive, false); + + /// Allows to use existing node as destination, it will be overwritten. + FLUENT_FIELD_DEFAULT(bool, Force, false); + + /// Whether to preserve account of source node. + FLUENT_FIELD_DEFAULT(bool, PreserveAccount, false); + + /// Whether to preserve `expiration_time` attribute of source node. + FLUENT_FIELD_OPTION(bool, PreserveExpirationTime); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Move +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#move +struct TMoveOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TMoveOptions; + /// @endcond + + /// Create missing directories in destination path if required. + FLUENT_FIELD_DEFAULT(bool, Recursive, false); + + /// Allows to use existing node as destination, it will be overwritten. + FLUENT_FIELD_DEFAULT(bool, Force, false); + + /// Whether to preserve account of source node. + FLUENT_FIELD_DEFAULT(bool, PreserveAccount, false); + + /// Whether to preserve `expiration_time` attribute of source node. + FLUENT_FIELD_OPTION(bool, PreserveExpirationTime); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Link +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#link +struct TLinkOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TLinkOptions; + /// @endcond + + /// Create parent directories of destination if they don't exist. + FLUENT_FIELD_DEFAULT(bool, Recursive, false); + + /// Do not raise error if link already exists. + FLUENT_FIELD_DEFAULT(bool, IgnoreExisting, false); + + /// Force rewrite target node. 
+ FLUENT_FIELD_DEFAULT(bool, Force, false); + + /// Attributes of created link. + FLUENT_FIELD_OPTION(TNode, Attributes); +}; + +/// +/// @brief Options for @ref NYT::ICypressClient::Concatenate +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#concatenate +struct TConcatenateOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TConcatenateOptions; + /// @endcond + + /// Whether we should append to destination or rewrite it. + FLUENT_FIELD_OPTION(bool, Append); +}; + +/// +/// @brief Options for @ref NYT::IIOClient::CreateBlobTableReader +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#read_blob_table +struct TBlobTableReaderOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TBlobTableReaderOptions; + /// @endcond + + /// Name of the part index column. By default it is "part_index". + FLUENT_FIELD_OPTION(TString, PartIndexColumnName); + + /// Name of the data column. By default it is "data". + FLUENT_FIELD_OPTION(TString, DataColumnName); + + /// + /// @brief Size of each part. + /// + /// All blob parts except the last part of the blob must be of this size + /// otherwise blob table reader emits error. + FLUENT_FIELD_DEFAULT(ui64, PartSize, 4 * 1024 * 1024); + + /// @brief Offset from which to start reading + FLUENT_FIELD_DEFAULT(i64, Offset, 0); +}; + +/// +/// @brief Resource limits for operation (or pool) +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/scheduler/scheduler_and_pools#resursy +/// @see NYT::TUpdateOperationParametersOptions +struct TResourceLimits +{ + /// @cond Doxygen_Suppress + using TSelf = TResourceLimits; + /// @endcond + + /// Number of slots for user jobs. + FLUENT_FIELD_OPTION(i64, UserSlots); + + /// Number of cpu cores. + FLUENT_FIELD_OPTION(double, Cpu); + + /// Network usage. Doesn't have precise physical unit. + FLUENT_FIELD_OPTION(i64, Network); + + /// Memory in bytes. + FLUENT_FIELD_OPTION(i64, Memory); +}; + +/// +/// @brief Scheduling options for single pool tree. 
+/// +/// @see NYT::TUpdateOperationParametersOptions +struct TSchedulingOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TSchedulingOptions; + /// @endcond + + /// + /// @brief Pool to switch operation to. + /// + /// @note Switching is currently disabled on the server (will induce an exception). + FLUENT_FIELD_OPTION(TString, Pool); + + /// @brief Operation weight. + FLUENT_FIELD_OPTION(double, Weight); + + /// @brief Operation resource limits. + FLUENT_FIELD_OPTION(TResourceLimits, ResourceLimits); +}; + +/// +/// @brief Collection of scheduling options for multiple pool trees. +/// +/// @see NYT::TUpdateOperationParametersOptions +struct TSchedulingOptionsPerPoolTree +{ + /// @cond Doxygen_Suppress + using TSelf = TSchedulingOptionsPerPoolTree; + /// @endcond + + TSchedulingOptionsPerPoolTree(const THashMap<TString, TSchedulingOptions>& options = {}) + : Options_(options) + { } + + /// Add scheduling options for pool tree. + TSelf& Add(TStringBuf poolTreeName, const TSchedulingOptions& schedulingOptions) + { + Y_ENSURE(Options_.emplace(poolTreeName, schedulingOptions).second); + return *this; + } + + THashMap<TString, TSchedulingOptions> Options_; +}; + +/// +/// @brief Options for @ref NYT::IOperation::SuspendOperation +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#suspend_op +struct TSuspendOperationOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TSuspendOperationOptions; + /// @endcond + + /// + /// @brief Whether to abort already running jobs. + /// + /// By default running jobs are not aborted. + FLUENT_FIELD_OPTION(bool, AbortRunningJobs); +}; + +/// +/// @brief Options for @ref NYT::IOperation::ResumeOperation +/// +/// @note They are empty for now but options might appear in the future. 
+/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#resume_op +struct TResumeOperationOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TResumeOperationOptions; + /// @endcond +}; + +/// +/// @brief Options for @ref NYT::IOperation::UpdateParameters +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#update_op_parameters +struct TUpdateOperationParametersOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TUpdateOperationParametersOptions; + /// @endcond + + /// New owners of the operation. + FLUENT_VECTOR_FIELD(TString, Owner); + + /// Pool to switch operation to (for all pool trees it is running in). + FLUENT_FIELD_OPTION(TString, Pool); + + /// New operation weight (for all pool trees it is running in). + FLUENT_FIELD_OPTION(double, Weight); + + /// Scheduling options for each pool tree the operation is running in. + FLUENT_FIELD_OPTION(TSchedulingOptionsPerPoolTree, SchedulingOptionsPerPoolTree); +}; + +/// +/// @brief Base class for many options related to IO. +/// +/// @ref NYT::TFileWriterOptions +/// @ref NYT::TFileReaderOptions +/// @ref NYT::TTableReaderOptions +/// @ref NYT::TTableWriterOptions +template <class TDerived> +struct TIOOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// + /// @brief Advanced options for reader/writer. + /// + /// Readers/writers have many options, not all of them are supported by library. + /// If you need such unsupported option, you might use `Config` option until + /// option is supported. + /// + /// Example: + /// + /// TTableWriterOptions().Config(TNode()("max_row_weight", 64 << 20)) + /// + /// @note We encourage you to ask yt@ to add native C++ support of required options + /// and use `Config` only as temporary solution while native support is not ready. + FLUENT_FIELD_OPTION(TNode, Config); + + /// + /// @brief Whether to create internal client transaction for reading / writing table. + /// + /// This is advanced option. 
+ /// + /// If `CreateTransaction` is set to `false` reader/writer doesn't create internal transaction + /// and doesn't lock table. This option is overridden (effectively `false`) for writers by + /// @ref NYT::TTableWriterOptions::SingleHttpRequest + /// + /// WARNING: if `CreateTransaction` is `false`, read/write might become non-atomic. + /// Change ONLY if you are sure what you are doing! + FLUENT_FIELD_DEFAULT(bool, CreateTransaction, true); +}; + +/// @brief Options for reading file from YT. +struct TFileReaderOptions + : public TIOOptions<TFileReaderOptions> +{ + /// + /// @brief Offset to start reading from. + /// + /// By default reading is started from the beginning of the file. + FLUENT_FIELD_OPTION(i64, Offset); + + /// + /// @brief Maximum length to read. + /// + /// By default file is read until the end. + FLUENT_FIELD_OPTION(i64, Length); +}; + +/// @brief Options that control how server side of YT stores data. +struct TWriterOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TWriterOptions; + /// @endcond + + /// + /// @brief Whether to wait all replicas to be written. + /// + /// When set to true upload will be considered successful as soon as + /// @ref NYT::TWriterOptions::MinUploadReplicationFactor number of replicas are created. + FLUENT_FIELD_OPTION(bool, EnableEarlyFinish); + + /// Number of replicas to be created. + FLUENT_FIELD_OPTION(ui64, UploadReplicationFactor); + + /// + /// Min number of created replicas needed to consider upload successful. + /// + /// @see NYT::TWriterOptions::EnableEarlyFinish + FLUENT_FIELD_OPTION(ui64, MinUploadReplicationFactor); + + /// + /// @brief Desired size of a chunk. + /// + /// @see @ref NYT::TWriterOptions::RetryBlockSize + FLUENT_FIELD_OPTION(ui64, DesiredChunkSize); + + /// + /// @brief Size of data block accumulated in memory to provide retries. + /// + /// Data is accumulated in memory buffer so in case error occurs data could be resent. 
+ /// + /// If `RetryBlockSize` is not set buffer size is set to `DesiredChunkSize`. + /// If neither `RetryBlockSize` nor `DesiredChunkSize` is set size of buffer is 64MB. + /// + /// @note Written chunks cannot be larger than size of this memory buffer. + /// + /// Since DesiredChunkSize is compared against data already compressed with compression codec + /// it makes sense to set `RetryBlockSize = DesiredChunkSize / ExpectedCompressionRatio` + /// + /// @see @ref NYT::TWriterOptions::DesiredChunkSize + /// @see @ref NYT::TTableWriterOptions::SingleHttpRequest + FLUENT_FIELD_OPTION(size_t, RetryBlockSize); +}; + +/// +/// @brief Options for writing file +/// +/// @see NYT::IIOClient::CreateFileWriter +struct TFileWriterOptions + : public TIOOptions<TFileWriterOptions> +{ + /// + /// @brief Whether to compute MD5 sum of written file. + /// + /// If ComputeMD5 is set to `true` and we are appending to an existing file + /// the `md5` attribute must be set (i.e. it was previously written only with `ComputeMD5 == true`). + FLUENT_FIELD_OPTION(bool, ComputeMD5); + + /// + /// @brief Options to control how YT server side writes data. + /// + /// @see NYT::TWriterOptions + FLUENT_FIELD_OPTION(TWriterOptions, WriterOptions); +}; + +class TSkiffRowHints { +public: + /// @cond Doxygen_Suppress + using TSelf = TSkiffRowHints; + /// @endcond + + /// + /// @brief Library doesn't interpret it, only passes it to CreateSkiffParser<...>() and GetSkiffSchema<...>() functions. + /// + /// You can set something in it to pass necessary information to CreateSkiffParser<...>() and GetSkiffSchema<...>() functions. + FLUENT_FIELD_OPTION(TNode, Attributes); +}; + +/// Options that control how C++ objects represent table rows when reading or writing a table. +class TFormatHints +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TFormatHints; + /// @endcond + + /// + /// @brief Whether to skip null values. + /// + /// When set to true TNode doesn't contain null column values + /// (e.g. 
corresponding keys will be missing instead of containing null value). + /// + /// Only meaningful for TNode representation. + /// + /// Useful for sparse tables which have many columns in schema + /// but only few columns are set in any row. + FLUENT_FIELD_DEFAULT(bool, SkipNullValuesForTNode, false); + + /// + /// @brief Whether to convert string to numeric and boolean types (e.g. "42u" -> 42u, "false" -> %false) + /// when writing to schemaful table. + FLUENT_FIELD_OPTION(bool, EnableStringToAllConversion); + + /// + /// @brief Whether to convert numeric and boolean types to string (e.g., 3.14 -> "3.14", %true -> "true") + /// when writing to schemaful table. + FLUENT_FIELD_OPTION(bool, EnableAllToStringConversion); + + /// + /// @brief Whether to convert uint64 <-> int64 when writing to schemaful table. + /// + /// On overflow the corresponding error will be raised. + /// + /// This option is enabled by default. + FLUENT_FIELD_OPTION(bool, EnableIntegralTypeConversion); + + /// Whether to convert uint64 and int64 to double (e.g. 42 -> 42.0) when writing to schemaful table. + FLUENT_FIELD_OPTION(bool, EnableIntegralToDoubleConversion); + + /// Shortcut for enabling all type conversions. + FLUENT_FIELD_OPTION(bool, EnableTypeConversion); + + /// + /// @brief Controls how complex types are represented in TNode or yson-strings. + /// + /// @see https://yt.yandex-team.ru/docs/description/storage/data_types#yson + FLUENT_FIELD_OPTION(EComplexTypeMode, ComplexTypeMode); + + /// + /// @brief Allow to use any meta-information for creating skiff schema and parser for reading ISkiffRow. + FLUENT_FIELD_OPTION(TSkiffRowHints, SkiffRowHints); + + /// + /// @brief Apply the patch to the fields. + /// + /// Non-default and non-empty values replace the default and empty ones. + void Merge(const TFormatHints& patch); +}; + +/// Options that control which control attributes (like row_index) are added to rows during read. 
+class TControlAttributes +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TControlAttributes; + /// @endcond + + /// + /// @brief Whether to add "row_index" attribute to rows read. + FLUENT_FIELD_DEFAULT(bool, EnableRowIndex, true); + + /// + /// @brief Whether to add "range_index" attribute to rows read. + FLUENT_FIELD_DEFAULT(bool, EnableRangeIndex, true); +}; + +/// Options for @ref NYT::IClient::CreateTableReader +struct TTableReaderOptions + : public TIOOptions<TTableReaderOptions> +{ + /// @deprecated Size of internal client buffer. + FLUENT_FIELD_DEFAULT(size_t, SizeLimit, 4 << 20); + + /// + /// @brief Allows to fine tune format that is used for reading tables. + /// + /// Has no effect when used with raw-reader. + FLUENT_FIELD_OPTION(TFormatHints, FormatHints); + + /// + /// @brief Allows to tune which attributes are added to rows while reading tables. + /// + FLUENT_FIELD_DEFAULT(TControlAttributes, ControlAttributes, TControlAttributes()); +}; + +/// Options for @ref NYT::IClient::CreateTableWriter +struct TTableWriterOptions + : public TIOOptions<TTableWriterOptions> +{ + /// + /// @brief Enable or disable retryful writing. + /// + /// If set to true no retry is made but we also make less requests to master. + /// If set to false writer can make up to `TConfig::RetryCount` attempts to send each block of data. + /// + /// @note Writers' methods might throw strange exceptions that might look like network error + /// when `SingleHttpRequest == true` and YT node encounters an error + /// (due to limitations of HTTP protocol YT node have no chance to report error + /// before it reads the whole input so it just drops the connection). + FLUENT_FIELD_DEFAULT(bool, SingleHttpRequest, false); + + /// + /// @brief Allows to change the size of locally buffered rows before flushing to yt. 
+ /// + /// Used only with @ref NYT::TTableWriterOptions::SingleHttpRequest + FLUENT_FIELD_DEFAULT(size_t, BufferSize, 64 << 20); + + /// + /// @brief Allows to fine tune format that is used for writing tables. + /// + /// Has no effect when used with raw-writer. + FLUENT_FIELD_OPTION(TFormatHints, FormatHints); + + /// @brief Try to infer schema of inexistent table from the type of written rows. + /// + /// @note Default values for this option may differ depending on the row type. + /// For protobuf it's currently false by default. + FLUENT_FIELD_OPTION(bool, InferSchema); + + /// + /// @brief Options to control how YT server side writes data. + /// + /// @see NYT::TWriterOptions + FLUENT_FIELD_OPTION(TWriterOptions, WriterOptions); +}; + +/// +/// @brief Options for @ref NYT::IClient::StartTransaction +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#start_tx +struct TStartTransactionOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TStartTransactionOptions; + /// @endcond + + FLUENT_FIELD_DEFAULT(bool, PingAncestors, false); + + /// + /// @brief How long transaction lives after last ping. + /// + /// If server doesn't receive any pings for transaction for this time + /// transaction will be aborted. By default timeout is 15 seconds. + FLUENT_FIELD_OPTION(TDuration, Timeout); + + /// + /// @brief Moment in the future when transaction is aborted. + FLUENT_FIELD_OPTION(TInstant, Deadline); + + /// + /// @brief Whether to ping created transaction automatically. + /// + /// When set to true library creates a thread that pings transaction. + /// When set to false library doesn't ping transaction and it's user responsibility to ping it. + FLUENT_FIELD_DEFAULT(bool, AutoPingable, true); + + /// + /// @brief Set the title attribute of transaction. + /// + /// If title was not specified + /// neither using this option nor using @ref NYT::TStartTransactionOptions::Attributes option + /// library will generate default title for transaction. 
+ /// Such default title includes machine name, pid, user name and some other useful info. + FLUENT_FIELD_OPTION(TString, Title); + + /// + /// @brief Set custom transaction attributes + /// + /// @note @ref NYT::TStartTransactionOptions::Title option overrides `"title"` attribute. + FLUENT_FIELD_OPTION(TNode, Attributes); +}; + +/// +/// @brief Options for attaching transaction. +/// +/// @see NYT::IClient::AttachTransaction +struct TAttachTransactionOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TAttachTransactionOptions; + /// @endcond + + /// + /// @brief Ping transaction automatically. + /// + /// When set to |true| library creates a thread that pings transaction. + /// When set to |false| library doesn't ping transaction and + /// it's user responsibility to ping it. + FLUENT_FIELD_DEFAULT(bool, AutoPingable, false); + + /// + /// @brief Abort transaction on program termination. + /// + /// Should the transaction be aborted on program termination + /// (either normal or by a signal or uncaught exception -- two latter + /// only if @ref TInitializeOptions::CleanupOnTermination is set). + FLUENT_FIELD_DEFAULT(bool, AbortOnTermination, false); +}; + +/// +/// @brief Type of the lock. +/// +/// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locking_mode +/// @see NYT::ITransaction::Lock +enum ELockMode : int +{ + /// Exclusive lock. + LM_EXCLUSIVE /* "exclusive" */, + + /// Shared lock. + LM_SHARED /* "shared" */, + + /// Snapshot lock. + LM_SNAPSHOT /* "snapshot" */, +}; + +/// +/// @brief Options for locking cypress node +/// +/// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locks +/// @see NYT::ITransaction::Lock +struct TLockOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TLockOptions; + /// @endcond + + /// + /// @brief Whether to wait already locked node to be unlocked. 
+ /// + /// If `Waitable` is set to true Lock method will create + /// waitable lock, that will be taken once other transactions + /// that hold lock to that node are committed / aborted. + /// + /// @note Lock method DOES NOT wait until lock is actually acquired. + /// Waiting should be done using corresponding methods of ILock. + /// + /// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locking_queue + FLUENT_FIELD_DEFAULT(bool, Waitable, false); + + /// + /// @brief Also take attribute_key lock. + /// + /// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locks_compatibility + FLUENT_FIELD_OPTION(TString, AttributeKey); + + /// + /// @brief Also take child_key lock. + /// + /// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locks_compatibility + FLUENT_FIELD_OPTION(TString, ChildKey); +}; + +/// +/// @brief Options for @ref NYT::ITransaction::Unlock +/// +/// @note They are empty for now but options might appear in the future. +/// +/// @see https://yt.yandex-team.ru/docs/description/storage/transactions#locks_compatibility +struct TUnlockOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TUnlockOptions; + /// @endcond +}; + +/// Base class for options that deal with tablets. +template <class TDerived> +struct TTabletOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// Index of a first tablet to deal with. + FLUENT_FIELD_OPTION(i64, FirstTabletIndex); + + /// Index of a last tablet to deal with. + FLUENT_FIELD_OPTION(i64, LastTabletIndex); +}; + +/// +/// @brief Options for @ref NYT::IClient::MountTable +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#mount_table +struct TMountTableOptions + : public TTabletOptions<TMountTableOptions> +{ + /// @cond Doxygen_Suppress + using TSelf = TMountTableOptions; + /// @endcond + + /// If specified table will be mounted to this cell. 
+ FLUENT_FIELD_OPTION(TTabletCellId, CellId); + + /// If set to true tablets will be mounted in freezed state. + FLUENT_FIELD_DEFAULT(bool, Freeze, false); +}; + +/// +/// @brief Options for @ref NYT::IClient::UnmountTable +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#unmount_table +struct TUnmountTableOptions + : public TTabletOptions<TUnmountTableOptions> +{ + /// @cond Doxygen_Suppress + using TSelf = TUnmountTableOptions; + /// @endcond + + /// Advanced option, don't use unless yt team told you so. + FLUENT_FIELD_DEFAULT(bool, Force, false); +}; + +/// +/// @brief Options for @ref NYT::IClient::RemountTable +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#remount_table +struct TRemountTableOptions + : public TTabletOptions<TRemountTableOptions> +{ }; + +/// +/// @brief Options for @ref NYT::IClient::ReshardTable +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#reshard_table +struct TReshardTableOptions + : public TTabletOptions<TReshardTableOptions> +{ }; + +/// +/// @brief Options for @ref NYT::IClient::FreezeTable +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#freeze_table +struct TFreezeTableOptions + : public TTabletOptions<TFreezeTableOptions> +{ }; + +/// +/// @brief Options for @ref NYT::IClient::UnfreezeTable +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#unfreeze_table +struct TUnfreezeTableOptions + : public TTabletOptions<TUnfreezeTableOptions> +{ }; + +/// +/// @brief Options for @ref NYT::IClient::AlterTable +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#alter_table +struct TAlterTableOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TAlterTableOptions; + /// @endcond + + /// Change table schema. + FLUENT_FIELD_OPTION(TTableSchema, Schema); + + /// Alter table between static and dynamic mode. + FLUENT_FIELD_OPTION(bool, Dynamic); + + /// + /// @brief Changes id of upstream replica on metacluster. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/replicated_dynamic_tables + FLUENT_FIELD_OPTION(TReplicaId, UpstreamReplicaId); +}; + +/// +/// @brief Options for @ref NYT::IClient::LookupRows +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#lookup_rows +struct TLookupRowsOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TLookupRowsOptions; + /// @endcond + + /// Timeout for operation. + FLUENT_FIELD_OPTION(TDuration, Timeout); + + /// Column names to return. + FLUENT_FIELD_OPTION(TColumnNames, Columns); + + /// + /// @brief Whether to return rows that were not found in table. + /// + /// If set to true List returned by LookupRows method will have same + /// length as list of keys. If row is not found in table corresponding item in list + /// will have null value. + FLUENT_FIELD_DEFAULT(bool, KeepMissingRows, false); + + /// If set to true returned values will have "timestamp" attribute. + FLUENT_FIELD_OPTION(bool, Versioned); +}; + +/// +/// @brief Options for @ref NYT::IClient::SelectRows +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#select_rows +struct TSelectRowsOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TSelectRowsOptions; + /// @endcond + + /// Timeout for operation. + FLUENT_FIELD_OPTION(TDuration, Timeout); + + /// + /// @brief Limitation for number of rows read by single node. + /// + /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/dyn_query_language#ogranicheniya-na-slozhnost-zaprosa-(opcii) + FLUENT_FIELD_OPTION(i64, InputRowLimit); + + /// + /// @brief Limitation for number of output rows on single cluster node. + /// + /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/dyn_query_language#ogranicheniya-na-slozhnost-zaprosa-(opcii) + FLUENT_FIELD_OPTION(i64, OutputRowLimit); + + /// + /// @brief Maximum row ranges derived from WHERE clause. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/dyn_query_language#ogranicheniya-na-slozhnost-zaprosa-(opcii) + FLUENT_FIELD_DEFAULT(ui64, RangeExpansionLimit, 1000); + + /// + /// @brief Whether to fail if InputRowLimit or OutputRowLimit is exceeded. + /// + /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/dyn_query_language#ogranicheniya-na-slozhnost-zaprosa-(opcii) + FLUENT_FIELD_DEFAULT(bool, FailOnIncompleteResult, true); + + /// @brief Enable verbose logging on server side. + FLUENT_FIELD_DEFAULT(bool, VerboseLogging, false); + + FLUENT_FIELD_DEFAULT(bool, EnableCodeCache, true); +}; + +/// Options for NYT::CreateClient. +struct TCreateClientOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TCreateClientOptions; + /// @endcond + + /// @brief Impersonated user name. + /// + /// If authenticated user is allowed to impersonate other YT users (e.g. yql_agent), this field may be used to override user name. + FLUENT_FIELD_OPTION(TString, ImpersonationUser); + + /// @brief User token. + /// + /// @see NYT::TCreateClientOptions::TokenPath + FLUENT_FIELD(TString, Token); + + /// @brief Path to the file where user token is stored. + /// + /// Token is looked up in these places in following order: + /// - @ref NYT::TCreateClientOptions::Token + /// - @ref NYT::TCreateClientOptions::TokenPath + /// - `TConfig::Get()->Token` option. + /// - `YT_TOKEN` environment variable + /// - `YT_SECURE_VAULT_YT_TOKEN` environment variable + /// - File specified in `YT_TOKEN_PATH` environment variable + /// - `$HOME/.yt/token` file. + FLUENT_FIELD(TString, TokenPath); + + /// @brief TVM service ticket producer. + /// + /// We store a wrapper of NYT::TIntrusivePtr here (not a NYT::TIntrusivePtr), + /// because otherwise other projects will have build problems + /// because of visibility of two different `TIntrusivePtr`-s (::TIntrusivePtr and NYT::TIntrusivePtr). 
+ /// + /// @see NYT::NAuth::TServiceTicketClientAuth + /// {@ + NAuth::IServiceTicketAuthPtrWrapperPtr ServiceTicketAuth_ = nullptr; + TSelf& ServiceTicketAuth(const NAuth::IServiceTicketAuthPtrWrapper& wrapper); + /// @} + + /// @brief Use tvm-only endpoints in cluster connection. + FLUENT_FIELD_DEFAULT(bool, TvmOnly, false); + + /// @brief Use HTTPs (use HTTP client from yt/yt/core always). + /// + /// @see UseCoreHttpClient + FLUENT_FIELD_DEFAULT(bool, UseTLS, false); + + /// @brief Use HTTP client from yt/yt/core. + FLUENT_FIELD_DEFAULT(bool, UseCoreHttpClient, false); + + /// + /// @brief RetryConfig provider allows to fine tune request retries. + /// + /// E.g. set total timeout for all retries. + FLUENT_FIELD_DEFAULT(IRetryConfigProviderPtr, RetryConfigProvider, nullptr); + + /// @brief Override global config for the client. + /// + /// The config contains implementation parameters such as connection timeouts, + /// access token, api version and more. + /// @see NYT::TConfig + FLUENT_FIELD_DEFAULT(TConfigPtr, Config, nullptr); +}; + +/// +/// @brief Options for @ref NYT::IBatchRequest::ExecuteBatch +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#execute_batch +struct TExecuteBatchOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TExecuteBatchOptions; + /// @endcond + + /// + /// @brief How many requests will be executed in parallel on the cluster. + /// + /// This parameter could be used to avoid RequestLimitExceeded errors. + FLUENT_FIELD_OPTION(ui64, Concurrency); + + /// + /// @brief Maximum size of batch sent in one request to server. + /// + /// Huge batches are executed using multiple requests. + /// BatchPartMaxSize is maximum size of single request that goes to server + /// If not specified it is set to `Concurrency * 5' + FLUENT_FIELD_OPTION(ui64, BatchPartMaxSize); +}; + +/// +/// @brief Durability mode. 
+/// +/// @see NYT::TTabletTransactionOptions::Durability +/// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/sorted_dynamic_tables#sohrannost +enum class EDurability +{ + /// Sync mode (default). + Sync /* "sync" */, + + /// Async mode (might reduce latency of write requests, but less reliable). + Async /* "async" */, +}; + +/// +/// @brief Atomicity mode. +/// +/// @see NYT::TTabletTransactionOptions::Atomicity +/// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/sorted_dynamic_tables#sohrannost +enum class EAtomicity +{ + /// Transactions are non atomic (might reduce latency of write requests). + None /* "none" */, + + /// Transactions are atomic (default). + Full /* "full" */, +}; + +/// +/// @brief Table replica mode. +/// +/// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/replicated_dynamic_tables#atributy +enum class ETableReplicaMode +{ + Sync /* "sync" */, + Async /* "async" */, +}; + +/// Base class for options dealing with io to dynamic tables. +template <typename TDerived> +struct TTabletTransactionOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// + /// @brief Atomicity mode of operation + /// + /// Setting to NYT::EAtomicity::None allows to improve latency of operations + /// at the cost of weakening contracts. + /// + /// @note Use with care. + /// + /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/sorted_dynamic_tables#oslablenie-garantij + FLUENT_FIELD_OPTION(EAtomicity, Atomicity); + + /// + /// @brief Durability mode of operation + /// + /// Setting to NYT::EDurability::Async allows to improve latency of operations + /// at the cost of weakening contracts. + /// + /// @note Use with care. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/sorted_dynamic_tables#oslablenie-garantij + FLUENT_FIELD_OPTION(EDurability, Durability); +}; + +/// +/// @brief Options for NYT::IClient::InsertRows +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#insert_rows +struct TInsertRowsOptions + : public TTabletTransactionOptions<TInsertRowsOptions> +{ + /// + /// @brief Whether to overwrite missing columns with nulls. + /// + /// By default all columns missing in input data are set to Null and overwrite currently stored value. + /// If `Update' is set to true currently stored value will not be overwritten for columns that are missing in input data. + FLUENT_FIELD_OPTION(bool, Update); + + /// + /// @brief Whether to overwrite or aggregate aggregated columns. + /// + /// Used with aggregating columns. + /// By default value in aggregating column will be overwritten. + /// If `Aggregate' is set to true row will be considered as delta and it will be aggregated with currently stored value. + FLUENT_FIELD_OPTION(bool, Aggregate); + + /// + /// @brief Whether to fail when inserting to table without sync replica. + /// + /// Used for insert operation for tables without sync replica. + /// https://yt.yandex-team.ru/docs/description/dynamic_tables/replicated_dynamic_tables#write + /// Default value is 'false'. So insertion into table without sync replicas fails. + FLUENT_FIELD_OPTION(bool, RequireSyncReplica); +}; + +/// +/// @brief Options for NYT::IClient::DeleteRows +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#delete_rows +struct TDeleteRowsOptions + : public TTabletTransactionOptions<TDeleteRowsOptions> +{ + /// + /// @brief Whether to fail when deleting from table without sync replica. + /// + // Used for delete operation for tables without sync replica. + // https://yt.yandex-team.ru/docs/description/dynamic_tables/replicated_dynamic_tables#write + // Default value is 'false'. 
So deletion into table without sync replicas fails. + FLUENT_FIELD_OPTION(bool, RequireSyncReplica); +}; + +/// +/// @brief Options for NYT::IClient::TrimRows +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#trim_rows +struct TTrimRowsOptions + : public TTabletTransactionOptions<TTrimRowsOptions> +{ }; + +/// @brief Options for NYT::IClient::AlterTableReplica +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#alter_table_replica +/// @see https://yt.yandex-team.ru/docs/description/dynamic_tables/replicated_dynamic_tables +struct TAlterTableReplicaOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TAlterTableReplicaOptions; + /// @endcond + + /// + /// @brief Whether to enable or disable replica. + /// + /// Doesn't change state of replica if `Enabled' is not set. + FLUENT_FIELD_OPTION(bool, Enabled); + + /// + /// @brief Change replica mode. + /// + /// Doesn't change replica mode if `Mode` is not set. + FLUENT_FIELD_OPTION(ETableReplicaMode, Mode); +}; + +/// +/// @brief Options for @ref NYT::IClient::GetFileFromCache +/// +/// @note They are empty for now but options might appear in the future. +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#get_file_from_cache +struct TGetFileFromCacheOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetFileFromCacheOptions; + /// @endcond +}; + +/// +/// @brief Options for @ref NYT::IClient::PutFileToCache +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#put_file_to_cache +struct TPutFileToCacheOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TPutFileToCacheOptions; + /// @endcond + + /// Whether to preserve `expiration_timeout` attribute of source node. + FLUENT_FIELD_OPTION(bool, PreserveExpirationTimeout); +}; + +/// +/// Type of permission used in ACL. 
+/// +/// @see https://yt.yandex-team.ru/docs/description/common/access_control +enum class EPermission : int +{ + /// Applies to: all objects. + Read /* "read" */, + + /// Applies to: all objects. + Write /* "write" */, + + /// Applies to: accounts / pools. + Use /* "use" */, + + /// Applies to: all objects. + Administer /* "administer" */, + + /// Applies to: schemas. + Create /* "create" */, + + /// Applies to: all objects. + Remove /* "remove" */, + + /// Applies to: tables. + Mount /* "mount" */, + + /// Applies to: operations. + Manage /* "manage" */, +}; + +/// Whether permission is granted or denied. +enum class ESecurityAction : int +{ + /// Permission is granted. + Allow /* "allow" */, + + /// Permission is denied. + Deny /* "deny" */, +}; + +/// +/// @brief Options for @ref NYT::IClient::CheckPermission +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#check_permission +struct TCheckPermissionOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TCheckPermissionOptions; + /// @endcond + + /// Columns to check permission to (for tables only). + FLUENT_VECTOR_FIELD(TString, Column); +}; + +/// +/// @brief Columnar statistics fetching mode. +/// +/// @ref NYT::TGetTableColumnarStatisticsOptions::FetcherMode +enum class EColumnarStatisticsFetcherMode +{ + /// Slow mode for fetching precise columnar statistics. + FromNodes /* "from_nodes" */, + + /// + /// @brief Fast mode for fetching lightweight columnar statistics. + /// + /// Relative precision is 1 / 256. + /// + /// @note Might be unavailable for old tables in that case some upper bound is returned. + FromMaster /* "from_master" */, + + /// Use lightweight columnar statistics (FromMaster) if available otherwise switch to slow but precise mode (FromNodes). 
+ Fallback /* "fallback" */, +}; + +/// +/// @brief Options for @ref NYT::IClient::GetTableColumnarStatistics +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#get_table_columnar_statistics +struct TGetTableColumnarStatisticsOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetTableColumnarStatisticsOptions; + /// @endcond + + /// + /// @brief Mode of statistics fetching. + /// + /// @ref NYT::EColumnarStatisticsFetcherMode + FLUENT_FIELD_OPTION(EColumnarStatisticsFetcherMode, FetcherMode); +}; + +/// +/// @brief Table partitioning mode. +/// +/// @ref NYT::TGetTablePartitionsOptions::PartitionMode +enum class ETablePartitionMode +{ + /// + /// @brief Ignores the order of input tables and their chunk and sorting orders. + /// + Unordered /* "unordered" */, + + /// + /// @brief The order of table ranges inside each partition obey the order of input tables and their chunk orders. + /// + Ordered /* "ordered" */, +}; + +/// +/// @brief Options for @ref NYT::IClient::GetTablePartitions +/// +struct TGetTablePartitionsOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetTablePartitionsOptions; + /// @endcond + + /// + /// @brief Table partitioning mode. + /// + /// @ref NYT::ETablePartitionMode + FLUENT_FIELD(ETablePartitionMode, PartitionMode); + + /// + /// @brief Approximate data weight of each output partition. + /// + FLUENT_FIELD(i64, DataWeightPerPartition); + + /// + /// @brief Maximum output partition count. + /// + /// Consider the situation when the `MaxPartitionCount` is given + /// and the total data weight exceeds `MaxPartitionCount * DataWeightPerPartition`. + /// If `AdjustDataWeightPerPartition` is |true| + /// `GetTablePartitions` will yield partitions exceeding the `DataWeightPerPartition`. + /// If `AdjustDataWeightPerPartition` is |false| + /// the partitioning will be aborted as soon as the output partition count exceeds this limit. 
+ FLUENT_FIELD_OPTION(int, MaxPartitionCount); + + /// + /// @brief Allow the data weight per partition to exceed `DataWeightPerPartition` when `MaxPartitionCount` is set. + /// + /// |True| by default. + FLUENT_FIELD_DEFAULT(bool, AdjustDataWeightPerPartition, true); +}; + +/// +/// @brief Options for @ref NYT::IClient::GetTabletInfos +/// +/// @note They are empty for now but options might appear in the future. +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#get_tablet_infos +struct TGetTabletInfosOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetTabletInfosOptions; + /// @endcond +}; + +/// Options for @ref NYT::IClient::SkyShareTable +struct TSkyShareTableOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TSkyShareTableOptions; + /// @endcond + + /// + /// @brief Key columns that are used to group files in a table into torrents. + /// + /// One torrent is created for each value of `KeyColumns` columns. + /// If not specified, all files go into single torrent. + FLUENT_FIELD_OPTION(TColumnNames, KeyColumns); + + /// @brief Allow skynet manager to return fastbone links to skynet. 
See YT-11437 + FLUENT_FIELD_OPTION(bool, EnableFastbone); +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/common.cpp b/yt/cpp/mapreduce/interface/common.cpp new file mode 100644 index 0000000000..f6d60127ce --- /dev/null +++ b/yt/cpp/mapreduce/interface/common.cpp @@ -0,0 +1,664 @@ +#include "common.h" + +#include "errors.h" +#include "format.h" +#include "serialize.h" +#include "fluent.h" + +#include <yt/yt_proto/yt/formats/extension.pb.h> + +#include <library/cpp/yson/node/node_builder.h> +#include <library/cpp/yson/node/node_io.h> +#include <library/cpp/type_info/type.h> + +#include <util/generic/xrange.h> + +namespace NYT { + +using ::google::protobuf::FieldDescriptor; +using ::google::protobuf::Descriptor; + +//////////////////////////////////////////////////////////////////////////////// + +TSortColumn::TSortColumn(TStringBuf name, ESortOrder sortOrder) + : Name_(name) + , SortOrder_(sortOrder) +{ } + +TSortColumn::TSortColumn(const TString& name, ESortOrder sortOrder) + : TSortColumn(static_cast<TStringBuf>(name), sortOrder) +{ } + +TSortColumn::TSortColumn(const char* name, ESortOrder sortOrder) + : TSortColumn(static_cast<TStringBuf>(name), sortOrder) +{ } + +const TSortColumn& TSortColumn::EnsureAscending() const +{ + Y_ENSURE(SortOrder() == ESortOrder::SO_ASCENDING); + return *this; +} + +TNode TSortColumn::ToNode() const +{ + return BuildYsonNodeFluently().Value(*this); +} + +//////////////////////////////////////////////////////////////////////////////// +// Below lie backward compatibility methods. 
+//////////////////////////////////////////////////////////////////////////////// + +TSortColumn& TSortColumn::operator = (TStringBuf name) +{ + EnsureAscending(); + Name_ = name; + return *this; +} + +TSortColumn& TSortColumn::operator = (const TString& name) +{ + return (*this = static_cast<TStringBuf>(name)); +} + +TSortColumn& TSortColumn::operator = (const char* name) +{ + return (*this = static_cast<TStringBuf>(name)); +} + +bool TSortColumn::operator == (TStringBuf rhsName) const +{ + EnsureAscending(); + return Name_ == rhsName; +} + +bool TSortColumn::operator != (TStringBuf rhsName) const +{ + return !(*this == rhsName); +} + +bool TSortColumn::operator == (const TString& rhsName) const +{ + return *this == static_cast<TStringBuf>(rhsName); +} + +bool TSortColumn::operator != (const TString& rhsName) const +{ + return !(*this == rhsName); +} + +bool TSortColumn::operator == (const char* rhsName) const +{ + return *this == static_cast<TStringBuf>(rhsName); +} + +bool TSortColumn::operator != (const char* rhsName) const +{ + return !(*this == rhsName); +} + +TSortColumn::operator TStringBuf() const +{ + EnsureAscending(); + return Name_; +} + +TSortColumn::operator TString() const +{ + return TString(static_cast<TStringBuf>(*this)); +} + +TSortColumn::operator std::string() const +{ + EnsureAscending(); + return static_cast<std::string>(Name_); +} + +//////////////////////////////////////////////////////////////////////////////// + +TSortColumns::TSortColumns() +{ } + +TSortColumns::TSortColumns(const TVector<TString>& names) +{ + Parts_.assign(names.begin(), names.end()); +} + +TSortColumns::TSortColumns(const TColumnNames& names) + : TSortColumns(names.Parts_) +{ } + +TSortColumns::operator TColumnNames() const +{ + return TColumnNames(EnsureAscending().GetNames()); +} + +const TSortColumns& TSortColumns::EnsureAscending() const +{ + for (const auto& sortColumn : Parts_) { + sortColumn.EnsureAscending(); + } + return *this; +} + +TVector<TString> 
TSortColumns::GetNames() const +{ + TVector<TString> names; + names.reserve(Parts_.size()); + for (const auto& sortColumn : Parts_) { + names.push_back(sortColumn.Name()); + } + return names; +} + +//////////////////////////////////////////////////////////////////////////////// + +static NTi::TTypePtr OldTypeToTypeV3(EValueType type) +{ + switch (type) { + case VT_INT64: + return NTi::Int64(); + case VT_UINT64: + return NTi::Uint64(); + + case VT_DOUBLE: + return NTi::Double(); + + case VT_BOOLEAN: + return NTi::Bool(); + + case VT_STRING: + return NTi::String(); + + case VT_ANY: + return NTi::Yson(); + + case VT_INT8: + return NTi::Int8(); + case VT_INT16: + return NTi::Int16(); + case VT_INT32: + return NTi::Int32(); + + case VT_UINT8: + return NTi::Uint8(); + case VT_UINT16: + return NTi::Uint16(); + case VT_UINT32: + return NTi::Uint32(); + + case VT_UTF8: + return NTi::Utf8(); + + case VT_NULL: + return NTi::Null(); + + case VT_VOID: + return NTi::Void(); + + case VT_DATE: + return NTi::Date(); + case VT_DATETIME: + return NTi::Datetime(); + case VT_TIMESTAMP: + return NTi::Timestamp(); + case VT_INTERVAL: + return NTi::Interval(); + + case VT_FLOAT: + return NTi::Float(); + case VT_JSON: + return NTi::Json(); + } +} + +static std::pair<EValueType, bool> Simplify(const NTi::TTypePtr& type) +{ + using namespace NTi; + const auto typeName = type->GetTypeName(); + switch (typeName) { + case ETypeName::Bool: + return {VT_BOOLEAN, true}; + + case ETypeName::Int8: + return {VT_INT8, true}; + case ETypeName::Int16: + return {VT_INT16, true}; + case ETypeName::Int32: + return {VT_INT32, true}; + case ETypeName::Int64: + return {VT_INT64, true}; + + case ETypeName::Uint8: + return {VT_UINT8, true}; + case ETypeName::Uint16: + return {VT_UINT16, true}; + case ETypeName::Uint32: + return {VT_UINT32, true}; + case ETypeName::Uint64: + return {VT_UINT64, true}; + + case ETypeName::Float: + return {VT_FLOAT, true}; + case ETypeName::Double: + return {VT_DOUBLE, true}; + + 
case ETypeName::String: + return {VT_STRING, true}; + case ETypeName::Utf8: + return {VT_UTF8, true}; + + case ETypeName::Date: + return {VT_DATE, true}; + case ETypeName::Datetime: + return {VT_DATETIME, true}; + case ETypeName::Timestamp: + return {VT_TIMESTAMP, true}; + case ETypeName::Interval: + return {VT_INTERVAL, true}; + + case ETypeName::TzDate: + case ETypeName::TzDatetime: + case ETypeName::TzTimestamp: + break; + + case ETypeName::Json: + return {VT_JSON, true}; + case ETypeName::Decimal: + return {VT_STRING, true}; + case ETypeName::Uuid: + break; + case ETypeName::Yson: + return {VT_ANY, true}; + + case ETypeName::Void: + return {VT_VOID, false}; + case ETypeName::Null: + return {VT_NULL, false}; + + case ETypeName::Optional: + { + auto itemType = type->AsOptional()->GetItemType(); + if (itemType->IsPrimitive()) { + auto simplified = Simplify(itemType->AsPrimitive()); + if (simplified.second) { + simplified.second = false; + return simplified; + } + } + return {VT_ANY, false}; + } + case ETypeName::List: + return {VT_ANY, true}; + case ETypeName::Dict: + return {VT_ANY, true}; + case ETypeName::Struct: + return {VT_ANY, true}; + case ETypeName::Tuple: + return {VT_ANY, true}; + case ETypeName::Variant: + return {VT_ANY, true}; + case ETypeName::Tagged: + return Simplify(type->AsTagged()->GetItemType()); + } + ythrow TApiUsageError() << "Unsupported type: " << typeName; +} + +NTi::TTypePtr ToTypeV3(EValueType type, bool required) +{ + auto typeV3 = OldTypeToTypeV3(type); + if (!Simplify(typeV3).second) { + if (required) { + ythrow TApiUsageError() << "type: " << type << " cannot be required"; + } else { + return typeV3; + } + } + if (required) { + return typeV3; + } else { + return NTi::Optional(typeV3); + } +} + +TColumnSchema::TColumnSchema() + : TypeV3_(NTi::Optional(NTi::Int64())) +{ } + +EValueType TColumnSchema::Type() const +{ + return Simplify(TypeV3_).first; +} + +TColumnSchema& TColumnSchema::Type(EValueType type) & +{ + return 
Type(ToTypeV3(type, false)); +} + +TColumnSchema TColumnSchema::Type(EValueType type) && +{ + return Type(ToTypeV3(type, false)); +} + +TColumnSchema& TColumnSchema::Type(const NTi::TTypePtr& type) & +{ + Y_VERIFY(type.Get(), "Cannot create column schema with nullptr type"); + TypeV3_ = type; + return *this; +} + +TColumnSchema TColumnSchema::Type(const NTi::TTypePtr& type) && +{ + Y_VERIFY(type.Get(), "Cannot create column schema with nullptr type"); + TypeV3_ = type; + return *this; +} + +TColumnSchema& TColumnSchema::TypeV3(const NTi::TTypePtr& type) & +{ + return Type(type); +} + +TColumnSchema TColumnSchema::TypeV3(const NTi::TTypePtr& type) && +{ + return Type(type); +} + +NTi::TTypePtr TColumnSchema::TypeV3() const +{ + return TypeV3_; +} + +bool TColumnSchema::Required() const +{ + return Simplify(TypeV3_).second; +} + +TColumnSchema& TColumnSchema::Type(EValueType type, bool required) & +{ + return Type(ToTypeV3(type, required)); +} + +TColumnSchema TColumnSchema::Type(EValueType type, bool required) && +{ + return Type(ToTypeV3(type, required)); +} + +bool operator==(const TColumnSchema& lhs, const TColumnSchema& rhs) +{ + return + lhs.Name() == rhs.Name() && + NTi::NEq::TStrictlyEqual()(lhs.TypeV3(), rhs.TypeV3()) && + lhs.SortOrder() == rhs.SortOrder() && + lhs.Lock() == rhs.Lock() && + lhs.Expression() == rhs.Expression() && + lhs.Aggregate() == rhs.Aggregate() && + lhs.Group() == rhs.Group(); +} + +//////////////////////////////////////////////////////////////////////////////// + +bool TTableSchema::Empty() const +{ + return Columns_.empty(); +} + +TTableSchema& TTableSchema::AddColumn(const TString& name, EValueType type) & +{ + Columns_.push_back(TColumnSchema().Name(name).Type(type)); + return *this; +} + +TTableSchema TTableSchema::AddColumn(const TString& name, EValueType type) && +{ + return std::move(AddColumn(name, type)); +} + +TTableSchema& TTableSchema::AddColumn(const TString& name, EValueType type, ESortOrder sortOrder) & +{ + 
Columns_.push_back(TColumnSchema().Name(name).Type(type).SortOrder(sortOrder)); + return *this; +} + +TTableSchema TTableSchema::AddColumn(const TString& name, EValueType type, ESortOrder sortOrder) && +{ + return std::move(AddColumn(name, type, sortOrder)); +} + +TTableSchema& TTableSchema::AddColumn(const TString& name, const NTi::TTypePtr& type) & +{ + Columns_.push_back(TColumnSchema().Name(name).Type(type)); + return *this; +} + +TTableSchema TTableSchema::AddColumn(const TString& name, const NTi::TTypePtr& type) && +{ + return std::move(AddColumn(name, type)); +} + +TTableSchema& TTableSchema::AddColumn(const TString& name, const NTi::TTypePtr& type, ESortOrder sortOrder) & +{ + Columns_.push_back(TColumnSchema().Name(name).Type(type).SortOrder(sortOrder)); + return *this; +} + +TTableSchema TTableSchema::AddColumn(const TString& name, const NTi::TTypePtr& type, ESortOrder sortOrder) && +{ + return std::move(AddColumn(name, type, sortOrder)); +} + +TTableSchema& TTableSchema::SortBy(const TSortColumns& sortColumns) & +{ + Y_ENSURE(sortColumns.Parts_.size() <= Columns_.size()); + + THashMap<TString, ui64> sortColumnIndex; + for (auto i: xrange(sortColumns.Parts_.size())) { + Y_ENSURE(sortColumnIndex.emplace(sortColumns.Parts_[i].Name(), i).second, + "Key column name '" << sortColumns.Parts_[i].Name() << "' repeats in columns list"); + } + + TVector<TColumnSchema> newColumnsSorted(sortColumns.Parts_.size()); + TVector<TColumnSchema> newColumnsUnsorted; + for (auto& column : Columns_) { + auto it = sortColumnIndex.find(column.Name()); + if (it == sortColumnIndex.end()) { + column.ResetSortOrder(); + newColumnsUnsorted.push_back(std::move(column)); + } else { + auto index = it->second; + const auto& sortColumn = sortColumns.Parts_[index]; + column.SortOrder(sortColumn.SortOrder()); + newColumnsSorted[index] = std::move(column); + sortColumnIndex.erase(it); + } + } + + Y_ENSURE(sortColumnIndex.empty(), "Column name '" << sortColumnIndex.begin()->first + << "' not 
found in table schema"); + + newColumnsSorted.insert(newColumnsSorted.end(), newColumnsUnsorted.begin(), newColumnsUnsorted.end()); + Columns_ = std::move(newColumnsSorted); + + return *this; +} + +TTableSchema TTableSchema::SortBy(const TSortColumns& sortColumns) && +{ + return std::move(SortBy(sortColumns)); +} + +TVector<TColumnSchema>& TTableSchema::MutableColumns() +{ + return Columns_; +} + +TNode TTableSchema::ToNode() const +{ + TNode result; + TNodeBuilder builder(&result); + Serialize(*this, &builder); + return result; +} + +TTableSchema TTableSchema::FromNode(const TNode& node) +{ + TTableSchema schema; + Deserialize(schema, node); + return schema; +} + +bool operator==(const TTableSchema& lhs, const TTableSchema& rhs) +{ + return + lhs.Columns() == rhs.Columns() && + lhs.Strict() == rhs.Strict() && + lhs.UniqueKeys() == rhs.UniqueKeys(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TKeyBound::TKeyBound(ERelation relation, TKey key) + : Relation_(relation) + , Key_(std::move(key)) +{ } + +//////////////////////////////////////////////////////////////////////////////// + +TTableSchema CreateTableSchema( + const Descriptor& messageDescriptor, + const TSortColumns& sortColumns, + bool keepFieldsWithoutExtension) +{ + auto result = CreateTableSchema(messageDescriptor, keepFieldsWithoutExtension); + if (!sortColumns.Parts_.empty()) { + result.SortBy(sortColumns.Parts_); + } + return result; +} + +TTableSchema CreateTableSchema(NTi::TTypePtr type) +{ + Y_VERIFY(type); + TTableSchema schema; + Deserialize(schema, NodeFromYsonString(NTi::NIo::AsYtSchema(type.Get()))); + return schema; +} + +//////////////////////////////////////////////////////////////////////////////// + +bool IsTrivial(const TReadLimit& readLimit) +{ + return !readLimit.Key_ && !readLimit.RowIndex_ && !readLimit.Offset_ && !readLimit.TabletIndex_ && !readLimit.KeyBound_; +} + +EValueType NodeTypeToValueType(TNode::EType nodeType) +{ + switch 
(nodeType) { + case TNode::EType::Int64: return VT_INT64; + case TNode::EType::Uint64: return VT_UINT64; + case TNode::EType::String: return VT_STRING; + case TNode::EType::Double: return VT_DOUBLE; + case TNode::EType::Bool: return VT_BOOLEAN; + default: + ythrow yexception() << "Cannot convert TNode type " << nodeType << " to EValueType"; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +const TVector<TReadRange>& GetRangesCompat(const TRichYPath& path) +{ + static const TVector<TReadRange> empty; + + const auto& maybeRanges = path.GetRanges(); + if (maybeRanges.Empty()) { + return empty; + } else if (maybeRanges->size() > 0) { + return *maybeRanges; + } else { + // If you see this exception, that means that caller of this function doesn't know what to do + // with RichYPath that has set range list, but the range list is empty. + // + // To avoid this exception caller must explicitly handle such case. + // NB. YT-17683 + ythrow TApiUsageError() << "Unsupported RichYPath: explicitly empty range list"; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +TString ToString(EValueType type) +{ + switch (type) { + case VT_INT8: + return "int8"; + case VT_INT16: + return "int16"; + case VT_INT32: + return "int32"; + case VT_INT64: + return "int64"; + + case VT_UINT8: + return "uint8"; + case VT_UINT16: + return "uint16"; + case VT_UINT32: + return "uint32"; + case VT_UINT64: + return "uint64"; + + case VT_DOUBLE: + return "double"; + + case VT_BOOLEAN: + return "boolean"; + + case VT_STRING: + return "string"; + case VT_UTF8: + return "utf8"; + + case VT_ANY: + return "any"; + + case VT_NULL: + return "null"; + case VT_VOID: + return "void"; + + case VT_DATE: + return "date"; + case VT_DATETIME: + return "datetime"; + case VT_TIMESTAMP: + return "timestamp"; + case VT_INTERVAL: + 
return "interval"; + + case VT_FLOAT: + return "float"; + + case VT_JSON: + return "json"; + } + ythrow yexception() << "Invalid value type " << static_cast<int>(type); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +} // namespace NYT + +template <> +void Out<NYT::TSortColumn>(IOutputStream& os, const NYT::TSortColumn& sortColumn) +{ + if (sortColumn.SortOrder() == NYT::ESortOrder::SO_ASCENDING) { + os << sortColumn.Name(); + } else { + os << NYT::BuildYsonStringFluently(NYson::EYsonFormat::Text).Value(sortColumn); + } +} diff --git a/yt/cpp/mapreduce/interface/common.h b/yt/cpp/mapreduce/interface/common.h new file mode 100644 index 0000000000..b1754ade70 --- /dev/null +++ b/yt/cpp/mapreduce/interface/common.h @@ -0,0 +1,1301 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/common.h +/// +/// Header containing miscellaneous structs and classes used in library. + +#include "fwd.h" + +#include <library/cpp/type_info/type_info.h> +#include <library/cpp/yson/node/node.h> + +#include <util/generic/guid.h> +#include <util/generic/map.h> +#include <util/generic/maybe.h> +#include <util/generic/ptr.h> +#include <util/system/type_name.h> +#include <util/generic/vector.h> + +#include <google/protobuf/message.h> + +#include <initializer_list> +#include <type_traits> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// @cond Doxygen_Suppress +#define FLUENT_FIELD(type, name) \ + type name##_; \ + TSelf& name(const type& value) \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + static_assert(true) + +#define FLUENT_FIELD_ENCAPSULATED(type, name) \ +private: \ + type name##_; \ +public: \ + TSelf& name(const type& value) & \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + TSelf name(const type& value) && \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + const type& 
name() const & \ + { \ + return name##_; \ + } \ + type name() && \ + { \ + return name##_; \ + } \ + static_assert(true) + +#define FLUENT_FIELD_OPTION(type, name) \ + TMaybe<type> name##_; \ + TSelf& name(const type& value) \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + static_assert(true) + +#define FLUENT_FIELD_OPTION_ENCAPSULATED(type, name) \ +private: \ + TMaybe<type> name##_; \ +public: \ + TSelf& name(const type& value) & \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + TSelf name(const type& value) && \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + TSelf& Reset##name() & \ + { \ + name##_ = Nothing(); \ + return static_cast<TSelf&>(*this); \ + } \ + TSelf Reset##name() && \ + { \ + name##_ = Nothing(); \ + return static_cast<TSelf&>(*this); \ + } \ + const TMaybe<type>& name() const& \ + { \ + return name##_; \ + } \ + TMaybe<type> name() && \ + { \ + return name##_; \ + } \ + static_assert(true) + +#define FLUENT_FIELD_DEFAULT(type, name, defaultValue) \ + type name##_ = defaultValue; \ + TSelf& name(const type& value) \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + static_assert(true) + +#define FLUENT_FIELD_DEFAULT_ENCAPSULATED(type, name, defaultValue) \ +private: \ + type name##_ = defaultValue; \ +public: \ + TSelf& name(const type& value) & \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + TSelf name(const type& value) && \ + { \ + name##_ = value; \ + return static_cast<TSelf&>(*this); \ + } \ + const type& name() const & \ + { \ + return name##_; \ + } \ + type name() && \ + { \ + return name##_; \ + } \ + static_assert(true) + +#define FLUENT_VECTOR_FIELD(type, name) \ + TVector<type> name##s_; \ + TSelf& Add##name(const type& value) \ + { \ + name##s_.push_back(value); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf& name##s(TVector<type> values) \ + { \ + name##s_ = std::move(values); \ + return 
static_cast<TSelf&>(*this);\ + } \ + static_assert(true) + +#define FLUENT_OPTIONAL_VECTOR_FIELD_ENCAPSULATED(type, name) \ +private: \ + TMaybe<TVector<type>> name##s_; \ +public: \ + const TMaybe<TVector<type>>& name##s() const & { \ + return name##s_; \ + } \ + TMaybe<TVector<type>>& name##s() & { \ + return name##s_; \ + } \ + TMaybe<TVector<type>> name##s() && { \ + return std::move(name##s_); \ + } \ + TSelf& Add##name(const type& value) & \ + { \ + if (name##s_.Empty()) { \ + name##s_.ConstructInPlace(); \ + } \ + name##s_->push_back(value); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf Add##name(const type& value) && \ + { \ + if (name##s_.Empty()) { \ + name##s_.ConstructInPlace(); \ + } \ + name##s_->push_back(value); \ + return static_cast<TSelf&&>(*this);\ + } \ + TSelf& name##s(TVector<type> values) & \ + { \ + name##s_ = std::move(values); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf name##s(TVector<type> values) && \ + { \ + name##s_ = std::move(values); \ + return static_cast<TSelf&&>(*this);\ + } \ + TSelf& name##s(TNothing) & \ + { \ + name##s_ = Nothing(); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf name##s(TNothing) && \ + { \ + name##s_ = Nothing(); \ + return static_cast<TSelf&&>(*this);\ + } \ + TSelf& Reset##name##s() & \ + { \ + name##s_ = Nothing(); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf Reset##name##s() && \ + { \ + name##s_ = Nothing(); \ + return static_cast<TSelf&&>(*this);\ + } \ + static_assert(true) + +#define FLUENT_VECTOR_FIELD_ENCAPSULATED(type, name) \ +private: \ + TVector<type> name##s_; \ +public: \ + TSelf& Add##name(const type& value) & \ + { \ + name##s_.push_back(value); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf Add##name(const type& value) && \ + { \ + name##s_.push_back(value); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf& name##s(TVector<type> value) & \ + { \ + name##s_ = std::move(value); \ + return static_cast<TSelf&>(*this);\ + } \ + TSelf 
name##s(TVector<type> value) && \ + { \ + name##s_ = std::move(value); \ + return static_cast<TSelf&>(*this);\ + } \ + const TVector<type>& name##s() const & \ + { \ + return name##s_; \ + } \ + TVector<type> name##s() && \ + { \ + return name##s_; \ + } \ + static_assert(true) + +#define FLUENT_MAP_FIELD(keytype, valuetype, name) \ + TMap<keytype,valuetype> name##_; \ + TSelf& Add##name(const keytype& key, const valuetype& value) \ + { \ + name##_.emplace(key, value); \ + return static_cast<TSelf&>(*this);\ + } \ + static_assert(true) + +/// @endcond + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Convenience class that keeps sequence of items. +/// +/// Designed to be used as function parameter. +/// +/// Users of such function can then pass: +/// - single item, +/// - initializer list of items, +/// - vector of items; +/// as argument to this function. +/// +/// Example: +/// ``` +/// void Foo(const TOneOrMany<int>& arg); +/// ... +/// Foo(1); // ok +/// Foo({1, 2, 3}); // ok +/// ``` +template <class T, class TDerived> +struct TOneOrMany +{ + /// @cond Doxygen_Suppress + using TSelf = std::conditional_t<std::is_void_v<TDerived>, TOneOrMany, TDerived>; + /// @endcond + + /// Initialize with empty sequence. + TOneOrMany() = default; + + /// Initialize from initializer list. + template<class U> + TOneOrMany(std::initializer_list<U> il) + { + Parts_.assign(il.begin(), il.end()); + } + + /// Put arguments to sequence; every argument (including the first one) is perfectly forwarded to Add. + template <class U, class... TArgs> + requires std::is_convertible_v<U, T> + TOneOrMany(U&& arg, TArgs&&... args) + { + Add(std::forward<U>(arg), std::forward<TArgs>(args)...); + } + + /// Initialize from vector. + TOneOrMany(TVector<T> args) + : Parts_(std::move(args)) + { } + + /// @brief Order is defined the same way as in TVector + bool operator==(const TOneOrMany& rhs) const + { + // N.B. 
We would like to make this method to be `= default`, + // but this breaks MSVC compiler for the cases when T doesn't + // support comparison. + return Parts_ == rhs.Parts_; + } + + /// + /// @{ + /// + /// @brief Add all arguments to sequence + template <class U, class... TArgs> + requires std::is_convertible_v<U, T> + TSelf& Add(U&& part, TArgs&&... args) & + { + Parts_.push_back(std::forward<U>(part)); + if constexpr (sizeof...(args) > 0) { + [[maybe_unused]] int dummy[sizeof...(args)] = {(Parts_.push_back(std::forward<TArgs>(args)), 0) ... }; + } + return static_cast<TSelf&>(*this); + } + + template <class U, class... TArgs> + requires std::is_convertible_v<U, T> + TSelf Add(U&& part, TArgs&&... args) && + { + return std::move(Add(std::forward<U>(part), std::forward<TArgs>(args)...)); + } + /// @} + + /// Content of sequence. + TVector<T> Parts_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Type of the value that can occur in YT table. +/// +/// @ref NYT::TTableSchema +/// https://yt.yandex-team.ru/docs/description/storage/data_types +enum EValueType : int +{ + /// Int64, signed integer of 64 bits. + VT_INT64, + + /// Uint64, unsigned integer of 64 bits. + VT_UINT64, + + /// Double, floating point number of double precision (64 bits). + VT_DOUBLE, + /// Boolean, `true` or `false`. + VT_BOOLEAN, + + /// String, arbitrary byte sequence. + VT_STRING, + + /// Any, arbitrary yson document. + VT_ANY, + + /// Int8, signed integer of 8 bits. + VT_INT8, + /// Int16, signed integer of 16 bits. + VT_INT16, + /// Int32, signed integer of 32 bits. + VT_INT32, + + /// Uint8, unsigned integer of 8 bits. + VT_UINT8, + /// Uint16, unsigned integer of 16 bits. + VT_UINT16, + /// Uint32, unsigned integer of 32 bits. + VT_UINT32, + + /// Utf8, byte sequence that is valid utf8. 
+ VT_UTF8, + + /// Null, absence of value (almost never used in schemas) + VT_NULL, + /// Void, absence of value (almost never used in schemas); the difference between null and void is yql-specific. + VT_VOID, + + /// Date, number of days since Unix epoch (unsigned) + VT_DATE, + /// Datetime, number of seconds since Unix epoch (unsigned) + VT_DATETIME, + /// Timestamp, number of microseconds since Unix epoch (unsigned) + VT_TIMESTAMP, + /// Interval, difference between two timestamps (signed) + VT_INTERVAL, + + /// Float, floating point number (32 bits) + VT_FLOAT, + /// Json, sequence of bytes that is valid json. + VT_JSON, +}; + +/// +/// @brief Sort order. +/// +/// @ref NYT::TTableSchema +enum ESortOrder : int +{ + /// Ascending sort order. + SO_ASCENDING /* "ascending" */, + /// Descending sort order. + SO_DESCENDING /* "descending" */, +}; + +/// +/// @brief Value of "optimize_for" attribute. +/// +/// @ref NYT::TRichYPath +enum EOptimizeForAttr : i8 +{ + /// Optimize for scan + OF_SCAN_ATTR /* "scan" */, + + /// Optimize for lookup + OF_LOOKUP_ATTR /* "lookup" */, +}; + +/// +/// @brief Value of "erasure_codec" attribute. +/// +/// @ref NYT::TRichYPath +enum EErasureCodecAttr : i8 +{ + /// @cond Doxygen_Suppress + EC_NONE_ATTR /* "none" */, + EC_REED_SOLOMON_6_3_ATTR /* "reed_solomon_6_3" */, + EC_LRC_12_2_2_ATTR /* "lrc_12_2_2" */, + EC_ISA_LRC_12_2_2_ATTR /* "isa_lrc_12_2_2" */, + /// @endcond +}; + +/// +/// @brief Value of "schema_modification" attribute. +/// +/// @ref NYT::TRichYPath +enum ESchemaModificationAttr : i8 +{ + SM_NONE_ATTR /* "none" */, + SM_UNVERSIONED_UPDATE /* "unversioned_update" */, +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Table key column description. +/// +/// The description includes column name and sort order.
+/// +/// @anchor TSortOrder_backward_compatibility +/// @note +/// Many functions that use `TSortOrder` as argument used to take `TString` +/// (the only allowed sort order was "ascending" and user didn't have to specify it). +/// @note +/// This class is designed to provide backward compatibility for such code and therefore +/// objects of this class can be constructed and assigned from TString-like objects only. +/// +/// @see NYT::TSortOperationSpec +class TSortColumn +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TSortColumn; + /// @endcond + + /// Column name + FLUENT_FIELD_ENCAPSULATED(TString, Name); + + /// Sort order + FLUENT_FIELD_DEFAULT_ENCAPSULATED(ESortOrder, SortOrder, ESortOrder::SO_ASCENDING); + + /// + /// @{ + /// + /// @brief Construct object from name and sort order + /// + /// Constructors are intentionally implicit so `TSortColumn` can be compatible with old code. + /// @ref TSortOrder_backward_compatibility + TSortColumn(TStringBuf name = {}, ESortOrder sortOrder = ESortOrder::SO_ASCENDING); + TSortColumn(const TString& name, ESortOrder sortOrder = ESortOrder::SO_ASCENDING); + TSortColumn(const char* name, ESortOrder sortOrder = ESortOrder::SO_ASCENDING); + /// @} + + /// Check that sort order is ascending, throw exception otherwise. + const TSortColumn& EnsureAscending() const; + + /// @brief Convert sort to yson representation as YT API expects it. + TNode ToNode() const; + + /// @brief Comparison is default and checks both name and sort order. + bool operator == (const TSortColumn& rhs) const = default; + + /// + /// @{ + /// + /// @brief Assign object from column name, and set sort order to `ascending`. + /// + /// This is backward compatibility methods. 
+ /// + /// @ref TSortOrder_backward_compatibility + TSortColumn& operator = (TStringBuf name); + TSortColumn& operator = (const TString& name); + TSortColumn& operator = (const char* name); + /// @} + + bool operator == (const TStringBuf rhsName) const; + bool operator != (const TStringBuf rhsName) const; + bool operator == (const TString& rhsName) const; + bool operator != (const TString& rhsName) const; + bool operator == (const char* rhsName) const; + bool operator != (const char* rhsName) const; + + // Intentionally implicit conversions. + operator TString() const; + operator TStringBuf() const; + operator std::string() const; + + Y_SAVELOAD_DEFINE(Name_, SortOrder_); +}; + +/// +/// @brief List of @ref TSortColumn +/// +/// Contains a bunch of helper methods such as constructing from single object. +class TSortColumns + : public TOneOrMany<TSortColumn, TSortColumns> +{ +public: + using TOneOrMany<TSortColumn, TSortColumns>::TOneOrMany; + + /// Construct empty list. + TSortColumns(); + + /// + /// @{ + /// + /// @brief Construct list of ascending sort order columns by their names. + /// + /// Required for backward compatibility. + /// + /// @ref TSortOrder_backward_compatibility + TSortColumns(const TVector<TString>& names); + TSortColumns(const TColumnNames& names); + /// @} + + + /// + /// @brief Implicit conversion to column list. + /// + /// If all columns has ascending sort order return list of their names. + /// Throw exception otherwise. + /// + /// Required for backward compatibility. + /// + /// @ref TSortOrder_backward_compatibility + operator TColumnNames() const; + + /// Make sure that all columns are of ascending sort order. + const TSortColumns& EnsureAscending() const; + + /// Get list of column names. + TVector<TString> GetNames() const; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Helper function to create new style type from old style one. 
+NTi::TTypePtr ToTypeV3(EValueType type, bool required); + +/// +/// @brief Single column description +/// +/// Each field describing column has setter and getter. +/// +/// Example reading field: +/// ``` +/// ... columnSchema.Name() ... +/// ``` +/// +/// Example setting field: +/// ``` +/// columnSchema.Name("my-column").Type(VT_INT64); // set name and type +/// ``` +/// +/// @ref https://yt.yandex-team.ru/docs/description/storage/static_schema +class TColumnSchema +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TColumnSchema; + /// @endcond + + /// + /// @brief Construct empty column schema + /// + /// @note + /// Such a column schema cannot be used in a table schema as it doesn't have a name. + TColumnSchema(); + + /// + /// @{ + /// + /// @brief Copy constructor and copy assignment are default. + TColumnSchema(const TColumnSchema&) = default; + TColumnSchema& operator=(const TColumnSchema&) = default; + /// @} + + + FLUENT_FIELD_ENCAPSULATED(TString, Name); + + /// + /// @brief Functions to work with type in old manner. + /// + /// @deprecated New code is recommended to work with types using @ref NTi::TTypePtr from type_info library. + TColumnSchema& Type(EValueType type) &; + TColumnSchema Type(EValueType type) &&; + EValueType Type() const; + + /// @brief Set and get column type. + /// @{ + TColumnSchema& Type(const NTi::TTypePtr& type) &; + TColumnSchema Type(const NTi::TTypePtr& type) &&; + + TColumnSchema& TypeV3(const NTi::TTypePtr& type) &; + TColumnSchema TypeV3(const NTi::TTypePtr& type) &&; + NTi::TTypePtr TypeV3() const; + /// @} + + /// + /// @brief Raw yson representation of column type + /// @deprecated Prefer to use `TypeV3` methods.
+ FLUENT_FIELD_OPTION_ENCAPSULATED(TNode, RawTypeV3); + + /// Column sort order + FLUENT_FIELD_OPTION_ENCAPSULATED(ESortOrder, SortOrder); + + /// + /// @brief Lock group name + /// + /// @ref https://yt.yandex-team.ru/docs/description/dynamic_tables/sorted_dynamic_tables#blokirovka-stroki + FLUENT_FIELD_OPTION_ENCAPSULATED(TString, Lock); + + /// Expression defining column value + FLUENT_FIELD_OPTION_ENCAPSULATED(TString, Expression); + + /// Aggregating function name + FLUENT_FIELD_OPTION_ENCAPSULATED(TString, Aggregate); + + /// + /// @brief Storage group name + /// + /// @ref https://yt.yandex-team.ru/docs/description/storage/static_schema + FLUENT_FIELD_OPTION_ENCAPSULATED(TString, Group); + + /// + /// @brief Column requiredness. + /// + /// Required columns don't accept NULL values. + /// Usually a column is required when its type is not Optional<...>. + bool Required() const; + + /// + /// @{ + /// + /// @brief Set type in old-style manner + TColumnSchema& Type(EValueType type, bool required) &; + TColumnSchema Type(EValueType type, bool required) &&; + /// @} + +private: + friend void Deserialize(TColumnSchema& columnSchema, const TNode& node); + NTi::TTypePtr TypeV3_; + bool Required_ = false; +}; + +/// Equality check checks all fields of column schema. +bool operator==(const TColumnSchema& lhs, const TColumnSchema& rhs); + +/// +/// @brief Description of table schema +/// +/// @see https://yt.yandex-team.ru/docs/description/storage/static_schema +class TTableSchema +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TTableSchema; + /// @endcond + + /// Column schema + FLUENT_VECTOR_FIELD_ENCAPSULATED(TColumnSchema, Column); + + /// + /// @brief Strictness of the schema + /// + /// Strict schemas are not allowed to have columns not described in schema.
+ /// Nonstrict schemas are allowed to have such columns, all such missing columns are assumed to have + FLUENT_FIELD_DEFAULT_ENCAPSULATED(bool, Strict, true); + + /// + /// @brief Whether keys are unique + /// + /// This flag can be set only for schemas that have sorted columns. + /// If flag is set table cannot have multiple rows with same key. + FLUENT_FIELD_DEFAULT_ENCAPSULATED(bool, UniqueKeys, false); + + /// Get modifiable column list + TVector<TColumnSchema>& MutableColumns(); + + /// Check if schema has any described column + [[nodiscard]] bool Empty() const; + + /// Add column + TTableSchema& AddColumn(const TString& name, const NTi::TTypePtr& type, ESortOrder sortOrder) &; + /// @copydoc NYT::TTableSchema::AddColumn(const TString&, const NTi::TTypePtr&, ESortOrder)&; + TTableSchema AddColumn(const TString& name, const NTi::TTypePtr& type, ESortOrder sortOrder) &&; + + /// @copydoc NYT::TTableSchema::AddColumn(const TString&, const NTi::TTypePtr&, ESortOrder)&; + TTableSchema& AddColumn(const TString& name, const NTi::TTypePtr& type) &; + /// @copydoc NYT::TTableSchema::AddColumn(const TString&, const NTi::TTypePtr&, ESortOrder)&; + TTableSchema AddColumn(const TString& name, const NTi::TTypePtr& type) &&; + + /// Add optional column of specified type + TTableSchema& AddColumn(const TString& name, EValueType type, ESortOrder sortOrder) &; + /// @copydoc NYT::TTableSchema::AddColumn(const TString&, EValueType, ESortOrder)&; + TTableSchema AddColumn(const TString& name, EValueType type, ESortOrder sortOrder) &&; + + /// @copydoc NYT::TTableSchema::AddColumn(const TString&, EValueType, ESortOrder)&; + TTableSchema& AddColumn(const TString& name, EValueType type) &; + /// @copydoc NYT::TTableSchema::AddColumn(const TString&, EValueType, ESortOrder)&; + TTableSchema AddColumn(const TString& name, EValueType type) &&; + + /// + /// @brief Make table schema sorted by specified columns + /// + /// Resets old key columns if any + TTableSchema& SortBy(const 
TSortColumns& columns) &; + + /// @copydoc NYT::TTableSchema::SortBy(const TSortColumns&)&; + TTableSchema SortBy(const TSortColumns& columns) &&; + + /// Get yson description of table schema + [[nodiscard]] TNode ToNode() const; + + /// Parse schema from yson node + static NYT::TTableSchema FromNode(const TNode& node); + + friend void Deserialize(TTableSchema& tableSchema, const TNode& node); +}; + +/// Check for equality of all columns and all schema attributes +bool operator==(const TTableSchema& lhs, const TTableSchema& rhs); + +/// Create table schema by protobuf message descriptor +TTableSchema CreateTableSchema( + const ::google::protobuf::Descriptor& messageDescriptor, + const TSortColumns& sortColumns = TSortColumns(), + bool keepFieldsWithoutExtension = true); + +/// Create table schema by protobuf message type +template <class TProtoType, typename = std::enable_if_t<std::is_base_of_v<::google::protobuf::Message, TProtoType>>> +inline TTableSchema CreateTableSchema( + const TSortColumns& sortColumns = TSortColumns(), + bool keepFieldsWithoutExtension = true) +{ + static_assert( + std::is_base_of_v<::google::protobuf::Message, TProtoType>, + "Template argument must be derived from ::google::protobuf::Message"); + + return CreateTableSchema( + *TProtoType::descriptor(), + sortColumns, + keepFieldsWithoutExtension); +} + +/// +/// @brief Create strict table schema from `struct` type. +/// +/// Names and types of columns are taken from struct member names and types. +/// `Strict` flag is set to true, all other attribute of schema and columns +/// are left with default values +TTableSchema CreateTableSchema(NTi::TTypePtr type); + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Enumeration describing comparison operation used in key bound. +/// +/// ERelation is a part of @ref NYT::TKeyBound that can be used as +/// lower or upper key limit in @ref TReadLimit. 
+/// +/// Relations `Less` and `LessOrEqual` are for upper limit and +/// relations `Greater` and `GreaterOrEqual` are for lower limit. +/// +/// It is an error to use relation in the limit of wrong kind. +/// +/// @see https://yt.yandex-team.ru/docs/description/common/ypath#rich_ypath +enum class ERelation +{ + /// + /// @brief Relation "less" + /// + /// Specifies range of keys that are before specified key. + /// Can only be used in upper limit. + Less /* "<" */, + + /// + /// @brief Relation "less or equal" + /// + /// Specifies range of keys that are before or equal to the specified key. + /// Can only be used in upper limit. + LessOrEqual /* "<=" */, + + /// + /// @brief Relation "greater" + /// + /// Specifies range of keys that are after specified key. + /// Can only be used in lower limit. + Greater /* ">" */, + + /// + /// @brief Relation "greater or equal" + /// + /// Specifies range of keys that are after or equal to the specified key. + /// Can only be used in lower limit. + GreaterOrEqual /* ">=" */, +}; + +/// +/// @brief Key with relation specifying interval of keys in lower or upper limit of @ref NYT::TReadRange +/// +/// @see https://yt.yandex-team.ru/docs/description/common/ypath#rich_ypath +struct TKeyBound +{ + /// @cond Doxygen_Suppress + using TSelf = TKeyBound; + + explicit TKeyBound(ERelation relation = ERelation::Less, TKey key = TKey{}); + + FLUENT_FIELD_DEFAULT_ENCAPSULATED(ERelation, Relation, ERelation::Less); + FLUENT_FIELD_DEFAULT_ENCAPSULATED(TKey, Key, TKey{}); + /// @endcond +}; + +/// +/// @brief Description of the read limit. +/// +/// It is actually a variant and must store exactly one field. +/// +/// @see https://yt.yandex-team.ru/docs/description/common/ypath#rich_ypath +struct TReadLimit +{ + /// @cond Doxygen_Suppress + using TSelf = TReadLimit; + /// @endcond + + /// + /// @brief KeyBound specifies table key and whether to include it + /// + /// It can be used in lower or upper limit when reading tables.
+ FLUENT_FIELD_OPTION(TKeyBound, KeyBound); + + /// + /// @brief Table key + /// + /// It can be used in exact, lower or upper limit when reading tables. + FLUENT_FIELD_OPTION(TKey, Key); + + /// + /// @brief Row index + /// + /// It can be used in exact, lower or upper limit when reading tables. + FLUENT_FIELD_OPTION(i64, RowIndex); + + /// + /// @brief File offset + /// + /// It can be used in lower or upper limit when reading files. + FLUENT_FIELD_OPTION(i64, Offset); + + /// + /// @brief Tablet index + /// + /// It can be used in lower or upper limit in dynamic table operations + FLUENT_FIELD_OPTION(i64, TabletIndex); +}; + +/// +/// @brief Range of a table or a file +/// +/// @see https://yt.yandex-team.ru/docs/description/common/ypath#rich_ypath +struct TReadRange +{ + using TSelf = TReadRange; + + /// + /// @brief Lower limit of the range + /// + /// It is usually inclusive (except when @ref NYT::TKeyBound with relation @ref NYT::ERelation::Greater is used). + FLUENT_FIELD(TReadLimit, LowerLimit); + + /// + /// @brief Upper limit of the range + /// + /// It is usually exclusive (except when @ref NYT::TKeyBound with relation @ref NYT::ERelation::LessOrEqual is used). + FLUENT_FIELD(TReadLimit, UpperLimit); + + /// Exact key or row index. + FLUENT_FIELD(TReadLimit, Exact); + + /// Create read range from row indexes. + static TReadRange FromRowIndices(i64 lowerLimit, i64 upperLimit) + { + return TReadRange() + .LowerLimit(TReadLimit().RowIndex(lowerLimit)) + .UpperLimit(TReadLimit().RowIndex(upperLimit)); + } + + /// Create read range from keys. + static TReadRange FromKeys(const TKey& lowerKeyInclusive, const TKey& upperKeyExclusive) + { + return TReadRange() + .LowerLimit(TReadLimit().Key(lowerKeyInclusive)) + .UpperLimit(TReadLimit().Key(upperKeyExclusive)); + } +}; + +/// +/// @brief Path with additional attributes. +/// +/// Allows to specify additional attributes for path used in some operations.
+/// +/// @see https://yt.yandex-team.ru/docs/description/common/ypath#rich_ypath +struct TRichYPath +{ + /// @cond Doxygen_Suppress + using TSelf = TRichYPath; + /// @endcond + + /// Path itself. + FLUENT_FIELD(TYPath, Path); + + /// Specifies that path should be appended not overwritten + FLUENT_FIELD_OPTION(bool, Append); + + /// @deprecated Deprecated attribute. + FLUENT_FIELD_OPTION(bool, PartiallySorted); + + /// Specifies that path is expected to be sorted by these columns. + FLUENT_FIELD(TSortColumns, SortedBy); + + /// Add range to read. + TRichYPath& AddRange(TReadRange range) + { + if (!Ranges_) { + Ranges_.ConstructInPlace(); + } + Ranges_->push_back(std::move(range)); + return *this; + } + + TRichYPath& ResetRanges() + { + Ranges_.Clear(); + return *this; + } + + /// + /// @{ + /// + /// Return ranges to read. + /// + /// NOTE: Nothing (in TMaybe) and empty TVector are different ranges. + /// Nothing represents universal range (reader reads all table rows). + /// Empty TVector represents empty range (reader returns empty set of rows). + const TMaybe<TVector<TReadRange>>& GetRanges() const + { + return Ranges_; + } + + TMaybe<TVector<TReadRange>>& MutableRanges() + { + return Ranges_; + } + + /// + /// @{ + /// + /// Get range view, that is convenient way to iterate through all ranges. + TArrayRef<TReadRange> MutableRangesView() + { + if (Ranges_.Defined()) { + return TArrayRef(Ranges_->data(), Ranges_->size()); + } else { + return {}; + } + } + + TArrayRef<const TReadRange> GetRangesView() const + { + if (Ranges_.Defined()) { + return TArrayRef(Ranges_->data(), Ranges_->size()); + } else { + return {}; + } + } + /// @} + + /// @{ + /// + /// Get range by index. + const TReadRange& GetRange(ssize_t i) const + { + return Ranges_.GetRef()[i]; + } + + TReadRange& MutableRange(ssize_t i) + { + return Ranges_.GetRef()[i]; + } + /// @} + + /// + /// @brief Specifies columns that should be read. + /// + /// If it's set to Nothing then all columns will be read. 
+ /// If empty TColumnNames is specified then each read row will be empty. + FLUENT_FIELD_OPTION(TColumnNames, Columns); + + FLUENT_FIELD_OPTION(bool, Teleport); + FLUENT_FIELD_OPTION(bool, Primary); + FLUENT_FIELD_OPTION(bool, Foreign); + FLUENT_FIELD_OPTION(i64, RowCountLimit); + + FLUENT_FIELD_OPTION(TString, FileName); + + /// Specifies original path to be shown in Web UI + FLUENT_FIELD_OPTION(TYPath, OriginalPath); + + /// + /// @brief Specifies that this path points to executable file + /// + /// Used in operation specs. + FLUENT_FIELD_OPTION(bool, Executable); + + /// + /// @brief Specify format to use when loading table. + /// + /// Used in operation specs. + FLUENT_FIELD_OPTION(TNode, Format); + + /// @brief Specifies table schema that will be set on the path + FLUENT_FIELD_OPTION(TTableSchema, Schema); + + /// Specifies compression codec that will be set on the path + FLUENT_FIELD_OPTION(TString, CompressionCodec); + + /// Specifies erasure codec that will be set on the path + FLUENT_FIELD_OPTION(EErasureCodecAttr, ErasureCodec); + + /// Specifies schema modification that will be set on the path + FLUENT_FIELD_OPTION(ESchemaModificationAttr, SchemaModification); + + /// Specifies optimize_for attribute that will be set on the path + FLUENT_FIELD_OPTION(EOptimizeForAttr, OptimizeFor); + + /// + /// @brief Do not put file used in operation into node cache + /// + /// If BypassArtifactCache == true, file will be loaded into the job's sandbox bypassing the cache on the YT node. + /// It helps jobs that use tmpfs to start faster, + /// because files will be loaded into tmpfs directly bypassing disk cache + FLUENT_FIELD_OPTION(bool, BypassArtifactCache); + + /// + /// @brief Timestamp of dynamic table. + /// + /// NOTE: it is _not_ unix timestamp + /// (instead it's transaction timestamp, that is more complex structure). + FLUENT_FIELD_OPTION(i64, Timestamp); + + /// + /// @brief Specify transaction that should be used to access this path. 
+ /// + /// Allows to start cross-transactional operations. + FLUENT_FIELD_OPTION(TTransactionId, TransactionId); + + using TRenameColumnsDescriptor = THashMap<TString, TString>; + + /// Specifies columnar mapping which will be applied to columns before transfer to job. + FLUENT_FIELD_OPTION(TRenameColumnsDescriptor, RenameColumns); + + /// Create empty path with no attributes + TRichYPath() + { } + + /// + /// @{ + /// + /// @brief Create path from string + TRichYPath(const char* path) + : Path_(path) + { } + + TRichYPath(const TYPath& path) + : Path_(path) + { } + /// @} + +private: + TMaybe<TVector<TReadRange>> Ranges_; +}; + +/// +/// @brief Create copy of @ref NYT::TRichYPath with schema derived from proto message. +/// +/// +template <typename TProtoType> +TRichYPath WithSchema(const TRichYPath& path, const TSortColumns& sortBy = TSortColumns()) +{ + static_assert(std::is_base_of_v<::google::protobuf::Message, TProtoType>, "TProtoType must be Protobuf message"); + + auto schemedPath = path; + if (!schemedPath.Schema_) { + schemedPath.Schema(CreateTableSchema<TProtoType>(sortBy)); + } + return schemedPath; +} + +/// +/// @brief Create copy of @ref NYT::TRichYPath with schema derived from TRowType if possible. +/// +/// If TRowType is protobuf message schema is derived from it and set to returned path. +/// Otherwise schema of original path is left unchanged (and probably unset). +template <typename TRowType> +TRichYPath MaybeWithSchema(const TRichYPath& path, const TSortColumns& sortBy = TSortColumns()) +{ + if constexpr (std::is_base_of_v<::google::protobuf::Message, TRowType>) { + return WithSchema<TRowType>(path, sortBy); + } else { + return path; + } +} + +/// +/// @brief Get the list of ranges related to path in compatibility mode. +/// +/// - If path is missing ranges, empty list is returned. +/// - If path has associated range list and the list is not empty, function returns this list.
+/// - If path has associated range list and this list is empty, exception is thrown. +/// +/// Before YT-17683 RichYPath didn't support empty range list and empty range actually meant universal range. +/// This function emulates this old behavior. +/// +/// @see https://st.yandex-team.ru/YT-17683 +const TVector<TReadRange>& GetRangesCompat(const TRichYPath& path); + +//////////////////////////////////////////////////////////////////////////////// + +/// Statistics about table columns. +struct TTableColumnarStatistics +{ + /// Total data weight for all chunks for each of requested columns. + THashMap<TString, i64> ColumnDataWeight; + + /// Total weight of all old chunks that don't keep columnar statistics. + i64 LegacyChunksDataWeight = 0; + + /// Timestamps total weight (only for dynamic tables). + TMaybe<i64> TimestampTotalWeight; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Description of a partition. +struct TMultiTablePartition +{ + struct TStatistics + { + i64 ChunkCount = 0; + i64 DataWeight = 0; + i64 RowCount = 0; + }; + + /// Ranges of input tables for this partition. + TVector<TRichYPath> TableRanges; + + /// Aggregate statistics of all the table ranges in the partition. + TStatistics AggregateStatistics; +}; + +/// Table partitions from GetTablePartitions command. +struct TMultiTablePartitions +{ + /// Disjoint partitions into which the input tables were divided. + TVector<TMultiTablePartition> Partitions; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Contains information about tablet +/// +/// @see NYT::IClient::GetTabletInfos +struct TTabletInfo +{ + /// + /// @brief Indicates the total number of rows added to the tablet (including trimmed ones). + /// + /// Currently only provided for ordered tablets. + i64 TotalRowCount = 0; + + /// + /// @brief Contains the number of front rows that are trimmed and are not guaranteed to be accessible.
+ /// + /// Only makes sense for ordered tablet. + i64 TrimmedRowCount = 0; + + /// + /// @brief Tablet cell barrier timestamp, which lags behind the current timestamp + /// + /// It is guaranteed that all transactions with commit timestamp not exceeding the barrier are fully committed; + /// i.e. all their added rows are visible (and are included in @ref NYT::TTabletInfo::TotalRowCount). + /// Mostly makes sense for ordered tablets. + ui64 BarrierTimestamp; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// List of attributes to retrieve in operations like @ref NYT::ICypressClient::Get +struct TAttributeFilter +{ + /// @cond Doxygen_Suppress + using TSelf = TAttributeFilter; + /// @endcond + + /// List of attributes. + FLUENT_VECTOR_FIELD(TString, Attribute); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Check if none of the fields of @ref NYT::TReadLimit is set. +/// +/// @return true if no field of readLimit is set and false otherwise. +bool IsTrivial(const TReadLimit& readLimit); + +/// Convert yson node type to table schema type +EValueType NodeTypeToValueType(TNode::EType nodeType); + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Enumeration for specifying how reading from master is performed. +/// +/// Used in operations like NYT::ICypressClient::Get +enum class EMasterReadKind : int +{ + /// + /// @brief Reading from leader. + /// + /// Should almost never be used since it's expensive and for regular uses has no difference from + /// "follower" read. + Leader /* "leader" */, + + /// @brief Reading from master follower (default).
+ Follower /* "follower" */, + Cache /* "cache" */, + MasterCache /* "master_cache" */, +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @cond Doxygen_Suppress +namespace NDetail { + +// MUST NOT BE USED BY CLIENTS +// TODO: we should use default GENERATE_ENUM_SERIALIZATION +TString ToString(EValueType type); + +} // namespace NDetail +/// @endcond + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/common_ut.cpp b/yt/cpp/mapreduce/interface/common_ut.cpp new file mode 100644 index 0000000000..3f19433816 --- /dev/null +++ b/yt/cpp/mapreduce/interface/common_ut.cpp @@ -0,0 +1,303 @@ +#include "common_ut.h" + +#include "fluent.h" + +#include <yt/cpp/mapreduce/interface/common.h> + +#include <yt/cpp/mapreduce/tests/yt_unittest_lib/yt_unittest_lib.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <library/cpp/yson/node/node_io.h> +#include <library/cpp/yson/node/node_builder.h> + +#include <util/generic/xrange.h> + +using namespace NYT; + +template <class T> +TString SaveToString(const T& obj) +{ + TString s; + TStringOutput out(s); + ::Save(&out, obj); + return s; +} + +template <class T> +T LoadFromString(TStringBuf s) +{ + TMemoryInput in(s); + T obj; + ::Load(&in, obj); + return obj; +} + +template <class T> +T SaveLoad(const T& obj) +{ + return LoadFromString<T>(SaveToString(obj)); +} + +Y_UNIT_TEST_SUITE(Common) +{ + Y_UNIT_TEST(SortColumnsLegacy) + { + TSortColumns keys1("a", "b"); + UNIT_ASSERT((keys1.Parts_ == TSortColumns{"a", "b"})); + + keys1.Add("c", "d"); + UNIT_ASSERT((keys1.Parts_ == TSortColumns{"a", "b", "c", "d"})); + + auto keys2 = TSortColumns(keys1).Add("e", "f"); + UNIT_ASSERT((keys1.Parts_ == TSortColumns{"a", "b", "c", "d"})); + UNIT_ASSERT((keys2.Parts_ == TSortColumns{"a", "b", "c", "d", "e", "f"})); + + auto keys3 = TSortColumns(keys1).Add("e").Add("f").Add("g"); + 
UNIT_ASSERT((keys1.Parts_ == TSortColumns{"a", "b", "c", "d"})); + UNIT_ASSERT((keys3.Parts_ == TSortColumns{"a", "b", "c", "d", "e", "f", "g"})); + } + + Y_UNIT_TEST(SortColumn) + { + auto ascending = TSortColumn("a"); + UNIT_ASSERT_VALUES_EQUAL(ascending.Name(), "a"); + UNIT_ASSERT_VALUES_EQUAL(ascending.SortOrder(), ESortOrder::SO_ASCENDING); + UNIT_ASSERT_VALUES_EQUAL(ascending, TSortColumn("a", ESortOrder::SO_ASCENDING)); + UNIT_ASSERT_VALUES_UNEQUAL(ascending, TSortColumn("a", ESortOrder::SO_DESCENDING)); + + UNIT_ASSERT_NO_EXCEPTION(ascending.EnsureAscending()); + UNIT_ASSERT_VALUES_EQUAL(static_cast<TString>(ascending), "a"); + UNIT_ASSERT_VALUES_EQUAL(ascending, "a"); + + auto another = ascending; + UNIT_ASSERT_NO_EXCEPTION(another = "another"); + UNIT_ASSERT_VALUES_EQUAL(another.Name(), "another"); + UNIT_ASSERT_VALUES_EQUAL(another.SortOrder(), ESortOrder::SO_ASCENDING); + UNIT_ASSERT_VALUES_EQUAL(another, TSortColumn("another", ESortOrder::SO_ASCENDING)); + UNIT_ASSERT_VALUES_UNEQUAL(another, TSortColumn("another", ESortOrder::SO_DESCENDING)); + + auto ascendingNode = BuildYsonNodeFluently().Value(ascending); + UNIT_ASSERT_VALUES_EQUAL(ascendingNode, TNode("a")); + + UNIT_ASSERT_VALUES_EQUAL(SaveLoad(ascending), ascending); + UNIT_ASSERT_VALUES_UNEQUAL(SaveToString(ascending), SaveToString(TString("a"))); + + auto descending = TSortColumn("a", ESortOrder::SO_DESCENDING); + UNIT_ASSERT_VALUES_EQUAL(descending.Name(), "a"); + UNIT_ASSERT_VALUES_EQUAL(descending.SortOrder(), ESortOrder::SO_DESCENDING); + UNIT_ASSERT_VALUES_EQUAL(descending, TSortColumn("a", ESortOrder::SO_DESCENDING)); + UNIT_ASSERT_VALUES_UNEQUAL(descending, TSortColumn("a", ESortOrder::SO_ASCENDING)); + + UNIT_ASSERT_EXCEPTION(descending.EnsureAscending(), yexception); + UNIT_ASSERT_EXCEPTION(static_cast<TString>(descending), yexception); + UNIT_ASSERT_EXCEPTION(descending == "a", yexception); + UNIT_ASSERT_EXCEPTION(descending = "a", yexception); + + auto descendingNode = 
BuildYsonNodeFluently().Value(descending); + UNIT_ASSERT_VALUES_EQUAL(descendingNode, TNode()("name", "a")("sort_order", "descending")); + + UNIT_ASSERT_VALUES_EQUAL(SaveLoad(descending), descending); + UNIT_ASSERT_VALUES_UNEQUAL(SaveToString(descending), SaveToString("a")); + + UNIT_ASSERT_VALUES_EQUAL(ToString(TSortColumn("blah")), "blah"); + UNIT_ASSERT_VALUES_EQUAL(ToString(TSortColumn("blah", ESortOrder::SO_DESCENDING)), "{\"name\"=\"blah\";\"sort_order\"=\"descending\"}"); + } + + Y_UNIT_TEST(SortColumns) + { + TSortColumns ascending("a", "b"); + UNIT_ASSERT(ascending.Parts_ == (TSortColumns{"a", "b"})); + UNIT_ASSERT_NO_EXCEPTION(ascending.EnsureAscending()); + UNIT_ASSERT_VALUES_EQUAL(static_cast<TColumnNames>(ascending).Parts_, (TVector<TString>{"a", "b"})); + UNIT_ASSERT_VALUES_EQUAL(ascending.GetNames(), (TVector<TString>{"a", "b"})); + + auto mixed = ascending; + mixed.Add(TSortColumn("c", ESortOrder::SO_DESCENDING), "d"); + UNIT_ASSERT((mixed.Parts_ != TVector<TSortColumn>{"a", "b", "c", "d"})); + UNIT_ASSERT((mixed.Parts_ == TVector<TSortColumn>{"a", "b", TSortColumn("c", ESortOrder::SO_DESCENDING), "d"})); + UNIT_ASSERT_VALUES_EQUAL(mixed.GetNames(), (TVector<TString>{"a", "b", "c", "d"})); + UNIT_ASSERT_EXCEPTION(mixed.EnsureAscending(), yexception); + UNIT_ASSERT_EXCEPTION(static_cast<TColumnNames>(mixed), yexception); + } + + Y_UNIT_TEST(KeyBound) + { + auto keyBound = TKeyBound(ERelation::Greater, TKey(7, "a", TNode()("x", "y"))); + UNIT_ASSERT_VALUES_EQUAL(keyBound.Relation(), ERelation::Greater); + UNIT_ASSERT_EQUAL(keyBound.Key(), TKey(7, "a", TNode()("x", "y"))); + + auto keyBound1 = TKeyBound().Relation(ERelation::Greater).Key(TKey(7, "a", TNode()("x", "y"))); + auto expectedNode = TNode() + .Add(">") + .Add(TNode().Add(7).Add("a").Add(TNode()("x", "y"))); + + UNIT_ASSERT_VALUES_EQUAL(expectedNode, BuildYsonNodeFluently().Value(keyBound)); + UNIT_ASSERT_VALUES_EQUAL(expectedNode, BuildYsonNodeFluently().Value(keyBound1)); + + 
keyBound.Relation(ERelation::LessOrEqual); + keyBound.Key(TKey("A", 7)); + UNIT_ASSERT_VALUES_EQUAL(keyBound.Relation(), ERelation::LessOrEqual); + UNIT_ASSERT_EQUAL(keyBound.Key(), TKey("A", 7)); + + UNIT_ASSERT_VALUES_EQUAL( + BuildYsonNodeFluently().Value(keyBound), + TNode() + .Add("<=") + .Add(TNode().Add("A").Add(7))); + } + + Y_UNIT_TEST(TTableSchema) + { + TTableSchema schema; + schema + .AddColumn(TColumnSchema().Name("a").Type(EValueType::VT_STRING).SortOrder(SO_ASCENDING)) + .AddColumn(TColumnSchema().Name("b").Type(EValueType::VT_UINT64)) + .AddColumn(TColumnSchema().Name("c").Type(EValueType::VT_INT64)); + auto checkSortBy = [](TTableSchema schema, const TVector<TString>& columns) { + auto initialSchema = schema; + schema.SortBy(columns); + for (auto i: xrange(columns.size())) { + UNIT_ASSERT_VALUES_EQUAL(schema.Columns()[i].Name(), columns[i]); + UNIT_ASSERT_VALUES_EQUAL(schema.Columns()[i].SortOrder(), ESortOrder::SO_ASCENDING); + } + for (auto i: xrange(columns.size(), (size_t)initialSchema.Columns().size())) { + UNIT_ASSERT_VALUES_EQUAL(schema.Columns()[i].SortOrder(), Nothing()); + } + UNIT_ASSERT_VALUES_EQUAL(initialSchema.Columns().size(), schema.Columns().size()); + return schema; + }; + auto newSchema = checkSortBy(schema, {"b"}); + UNIT_ASSERT_VALUES_EQUAL(newSchema.Columns()[1].Name(), TString("a")); + UNIT_ASSERT_VALUES_EQUAL(newSchema.Columns()[2].Name(), TString("c")); + checkSortBy(schema, {"b", "c"}); + checkSortBy(schema, {"c", "a"}); + UNIT_ASSERT_EXCEPTION(checkSortBy(schema, {"b", "b"}), yexception); + UNIT_ASSERT_EXCEPTION(checkSortBy(schema, {"a", "junk"}), yexception); + } + + Y_UNIT_TEST(TColumnSchema_TypeV3) + { + { + auto column = TColumnSchema().Type(NTi::Interval()); + UNIT_ASSERT_VALUES_EQUAL(column.Required(), true); + UNIT_ASSERT_VALUES_EQUAL(column.Type(), VT_INTERVAL); + } + { + auto column = TColumnSchema().Type(NTi::Optional(NTi::Date())); + UNIT_ASSERT_VALUES_EQUAL(column.Required(), false); + 
UNIT_ASSERT_VALUES_EQUAL(column.Type(), VT_DATE); + } + { + auto column = TColumnSchema().Type(NTi::Null()); + UNIT_ASSERT_VALUES_EQUAL(column.Required(), false); + UNIT_ASSERT_VALUES_EQUAL(column.Type(), VT_NULL); + } + { + auto column = TColumnSchema().Type(NTi::Optional(NTi::Null())); + UNIT_ASSERT_VALUES_EQUAL(column.Required(), false); + UNIT_ASSERT_VALUES_EQUAL(column.Type(), VT_ANY); + } + } + + Y_UNIT_TEST(ToTypeV3) + { + UNIT_ASSERT_VALUES_EQUAL(*ToTypeV3(VT_INT32, true), *NTi::Int32()); + UNIT_ASSERT_VALUES_EQUAL(*ToTypeV3(VT_UTF8, false), *NTi::Optional(NTi::Utf8())); + } + + Y_UNIT_TEST(DeserializeColumn) + { + auto deserialize = [] (TStringBuf yson) { + auto node = NodeFromYsonString(yson); + TColumnSchema column; + Deserialize(column, node); + return column; + }; + + auto column = deserialize("{name=foo; type=int64; required=%false}"); + UNIT_ASSERT_VALUES_EQUAL(column.Name(), "foo"); + UNIT_ASSERT_VALUES_EQUAL(*column.TypeV3(), *NTi::Optional(NTi::Int64())); + + column = deserialize("{name=bar; type=utf8; required=%true; type_v3=utf8}"); + UNIT_ASSERT_VALUES_EQUAL(column.Name(), "bar"); + UNIT_ASSERT_VALUES_EQUAL(*column.TypeV3(), *NTi::Utf8()); + } + + Y_UNIT_TEST(ColumnSchemaEquality) + { + auto base = TColumnSchema() + .Name("col") + .TypeV3(NTi::Optional(NTi::List(NTi::String()))) + .SortOrder(ESortOrder::SO_ASCENDING) + .Lock("lock") + .Expression("x + 12") + .Aggregate("sum") + .Group("group"); + + auto other = base; + ASSERT_SERIALIZABLES_EQUAL(other, base); + other.Name("other"); + ASSERT_SERIALIZABLES_UNEQUAL(other, base); + + other = base; + other.TypeV3(NTi::List(NTi::String())); + ASSERT_SERIALIZABLES_UNEQUAL(other, base); + + other = base; + other.ResetSortOrder(); + ASSERT_SERIALIZABLES_UNEQUAL(other, base); + + other = base; + other.Lock("lock1"); + ASSERT_SERIALIZABLES_UNEQUAL(other, base); + + other = base; + other.Expression("x + 13"); + ASSERT_SERIALIZABLES_UNEQUAL(other, base); + + other = base; + other.ResetAggregate(); + 
ASSERT_SERIALIZABLES_UNEQUAL(other, base); + + other = base; + other.Group("group1"); + ASSERT_SERIALIZABLES_UNEQUAL(other, base); + } + + Y_UNIT_TEST(TableSchemaEquality) + { + auto col1 = TColumnSchema() + .Name("col1") + .TypeV3(NTi::Optional(NTi::List(NTi::String()))) + .SortOrder(ESortOrder::SO_ASCENDING); + + auto col2 = TColumnSchema() + .Name("col2") + .TypeV3(NTi::Uint32()); + + auto schema = TTableSchema() + .AddColumn(col1) + .AddColumn(col2) + .Strict(true) + .UniqueKeys(true); + + auto other = schema; + ASSERT_SERIALIZABLES_EQUAL(other, schema); + + other.Strict(false); + ASSERT_SERIALIZABLES_UNEQUAL(other, schema); + + other = schema; + other.MutableColumns()[0].TypeV3(NTi::List(NTi::String())); + ASSERT_SERIALIZABLES_UNEQUAL(other, schema); + + other = schema; + other.MutableColumns().push_back(col1); + ASSERT_SERIALIZABLES_UNEQUAL(other, schema); + + other = schema; + other.UniqueKeys(false); + ASSERT_SERIALIZABLES_UNEQUAL(other, schema); + } +} diff --git a/yt/cpp/mapreduce/interface/common_ut.h b/yt/cpp/mapreduce/interface/common_ut.h new file mode 100644 index 0000000000..6f70f09bee --- /dev/null +++ b/yt/cpp/mapreduce/interface/common_ut.h @@ -0,0 +1 @@ +#pragma once diff --git a/yt/cpp/mapreduce/interface/config.cpp b/yt/cpp/mapreduce/interface/config.cpp new file mode 100644 index 0000000000..b474dc0844 --- /dev/null +++ b/yt/cpp/mapreduce/interface/config.cpp @@ -0,0 +1,321 @@ +#include "config.h" + +#include "operation.h" + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <library/cpp/json/json_reader.h> +#include <library/cpp/svnversion/svnversion.h> + +#include <library/cpp/yson/node/node_builder.h> +#include <library/cpp/yson/node/node_io.h> + +#include <library/cpp/yson/json/yson2json_adapter.h> + +#include <util/string/strip.h> +#include <util/folder/dirut.h> +#include <util/folder/path.h> +#include <util/stream/file.h> +#include <util/generic/singleton.h> +#include <util/string/builder.h> +#include 
<util/string/cast.h> +#include <util/string/type.h> +#include <util/system/hostname.h> +#include <util/system/user.h> +#include <util/system/env.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +bool TConfig::GetBool(const char* var, bool defaultValue) +{ + TString val = GetEnv(var, ""); + if (val.empty()) { + return defaultValue; + } + return IsTrue(val); +} + +int TConfig::GetInt(const char* var, int defaultValue) +{ + int result = 0; + TString val = GetEnv(var, ""); + if (val.empty()) { + return defaultValue; + } + try { + result = FromString<int>(val); + } catch (const yexception& e) { + ythrow yexception() << "Cannot parse " << var << '=' << val << " as integer: " << e.what(); + } + return result; +} + +TDuration TConfig::GetDuration(const char* var, TDuration defaultValue) +{ + return TDuration::Seconds(GetInt(var, defaultValue.Seconds())); +} + +EEncoding TConfig::GetEncoding(const char* var) +{ + const TString encodingName = GetEnv(var, "identity"); + EEncoding encoding; + if (TryFromString(encodingName, encoding)) { + return encoding; + } else { + ythrow yexception() << var << ": encoding '" << encodingName << "' is not supported"; + } +} + + EUploadDeduplicationMode TConfig::GetUploadingDeduplicationMode( + const char* var, + EUploadDeduplicationMode defaultValue) +{ + const TString deduplicationMode = GetEnv(var, TEnumTraits<EUploadDeduplicationMode>::ToString(defaultValue)); + return TEnumTraits<EUploadDeduplicationMode>::FromString(deduplicationMode); +} + +void TConfig::ValidateToken(const TString& token) +{ + for (size_t i = 0; i < token.size(); ++i) { + ui8 ch = token[i]; + if (ch < 0x21 || ch > 0x7e) { + ythrow yexception() << "Incorrect token character '" << ch << "' at position " << i; + } + } +} + +TString TConfig::LoadTokenFromFile(const TString& tokenPath) +{ + TFsPath path(tokenPath); + return path.IsFile() ? 
Strip(TIFStream(path).ReadAll()) : TString(); +} + +TNode TConfig::LoadJsonSpec(const TString& strSpec) +{ + TNode spec; + TStringInput input(strSpec); + TNodeBuilder builder(&spec); + TYson2JsonCallbacksAdapter callbacks(&builder); + + Y_ENSURE(NJson::ReadJson(&input, &callbacks), "Cannot parse json spec: " << strSpec); + Y_ENSURE(spec.IsMap(), "Json spec is not a map"); + + return spec; +} + +TRichYPath TConfig::LoadApiFilePathOptions(const TString& ysonMap) +{ + TNode attributes; + try { + attributes = NodeFromYsonString(ysonMap); + } catch (const yexception& exc) { + ythrow yexception() << "Failed to parse YT_API_FILE_PATH_OPTIONS (it must be yson map): " << exc; + } + TNode pathNode = ""; + pathNode.Attributes() = attributes; + TRichYPath path; + Deserialize(path, pathNode); + return path; +} + +void TConfig::LoadToken() +{ + if (auto envToken = GetEnv("YT_TOKEN")) { + Token = envToken; + } else if (auto envToken = GetEnv("YT_SECURE_VAULT_YT_TOKEN")) { + // If this code runs inside a vanilla operation in YT + // it should not use regular environment variable `YT_TOKEN` + // because it would be visible in UI. + // Token should be passed via `secure_vault` parameter in operation spec. 
+ Token = envToken; + } else if (auto tokenPath = GetEnv("YT_TOKEN_PATH")) { + Token = LoadTokenFromFile(tokenPath); + } else { + Token = LoadTokenFromFile(GetHomeDir() + "/.yt/token"); + } + ValidateToken(Token); +} + +void TConfig::LoadSpec() +{ + TString strSpec = GetEnv("YT_SPEC", "{}"); + Spec = LoadJsonSpec(strSpec); + + strSpec = GetEnv("YT_TABLE_WRITER", "{}"); + TableWriter = LoadJsonSpec(strSpec); +} + +void TConfig::LoadTimings() +{ + ConnectTimeout = GetDuration("YT_CONNECT_TIMEOUT", + TDuration::Seconds(10)); + + SocketTimeout = GetDuration("YT_SOCKET_TIMEOUT", + GetDuration("YT_SEND_RECEIVE_TIMEOUT", // common + TDuration::Seconds(60))); + + AddressCacheExpirationTimeout = TDuration::Minutes(15); + + CacheLockTimeoutPerGb = TDuration::MilliSeconds(1000.0 * 1_GB * 8 / 20_MB); // 20 Mbps = 20 MBps / 8. + + TxTimeout = GetDuration("YT_TX_TIMEOUT", + TDuration::Seconds(120)); + + PingTimeout = GetDuration("YT_PING_TIMEOUT", + TDuration::Seconds(5)); + + PingInterval = GetDuration("YT_PING_INTERVAL", + TDuration::Seconds(5)); + + WaitLockPollInterval = TDuration::Seconds(5); + + RetryInterval = GetDuration("YT_RETRY_INTERVAL", + TDuration::Seconds(3)); + + ChunkErrorsRetryInterval = GetDuration("YT_CHUNK_ERRORS_RETRY_INTERVAL", + TDuration::Seconds(60)); + + RateLimitExceededRetryInterval = GetDuration("YT_RATE_LIMIT_EXCEEDED_RETRY_INTERVAL", + TDuration::Seconds(60)); + + StartOperationRetryInterval = GetDuration("YT_START_OPERATION_RETRY_INTERVAL", + TDuration::Seconds(60)); + + HostListUpdateInterval = TDuration::Seconds(60); +} + +void TConfig::Reset() +{ + Hosts = GetEnv("YT_HOSTS", "hosts"); + Pool = GetEnv("YT_POOL"); + Prefix = GetEnv("YT_PREFIX"); + ApiVersion = GetEnv("YT_VERSION", "v3"); + LogLevel = GetEnv("YT_LOG_LEVEL", "error"); + + ContentEncoding = GetEncoding("YT_CONTENT_ENCODING"); + AcceptEncoding = GetEncoding("YT_ACCEPT_ENCODING"); + + GlobalTxId = GetEnv("YT_TRANSACTION", ""); + + UseAsyncTxPinger = false; + AsyncHttpClientThreads = 
1; + AsyncTxPingerPoolThreads = 1; + + ForceIpV4 = GetBool("YT_FORCE_IPV4"); + ForceIpV6 = GetBool("YT_FORCE_IPV6"); + UseHosts = GetBool("YT_USE_HOSTS", true); + + LoadToken(); + LoadSpec(); + LoadTimings(); + + CacheUploadDeduplicationMode = GetUploadingDeduplicationMode("YT_UPLOAD_DEDUPLICATION", EUploadDeduplicationMode::Host); + + RetryCount = Max(GetInt("YT_RETRY_COUNT", 10), 1); + ReadRetryCount = Max(GetInt("YT_READ_RETRY_COUNT", 30), 1); + StartOperationRetryCount = Max(GetInt("YT_START_OPERATION_RETRY_COUNT", 30), 1); + + RemoteTempFilesDirectory = GetEnv("YT_FILE_STORAGE", + "//tmp/yt_wrapper/file_storage"); + RemoteTempTablesDirectory = GetEnv("YT_TEMP_TABLES_STORAGE", + "//tmp/yt_wrapper/table_storage"); + RemoteTempTablesDirectory = GetEnv("YT_TEMP_DIR", + RemoteTempTablesDirectory); + + InferTableSchema = false; + + UseClientProtobuf = GetBool("YT_USE_CLIENT_PROTOBUF", false); + NodeReaderFormat = ENodeReaderFormat::Auto; + ProtobufFormatWithDescriptors = true; + + MountSandboxInTmpfs = GetBool("YT_MOUNT_SANDBOX_IN_TMPFS"); + + ApiFilePathOptions = LoadApiFilePathOptions(GetEnv("YT_API_FILE_PATH_OPTIONS", "{}")); + + ConnectionPoolSize = GetInt("YT_CONNECTION_POOL_SIZE", 16); + + TraceHttpRequestsMode = FromString<ETraceHttpRequestsMode>(to_lower(GetEnv("YT_TRACE_HTTP_REQUESTS", "never"))); + + CommandsWithFraming = { + "read_table", + "get_table_columnar_statistics", + "get_job_input", + "concatenate", + "partition_tables", + }; +} + +TConfig::TConfig() +{ + Reset(); +} + +TConfigPtr TConfig::Get() +{ + struct TConfigHolder + { + TConfigHolder() + : Config(::MakeIntrusive<TConfig>()) + { } + + TConfigPtr Config; + }; + + return Singleton<TConfigHolder>()->Config; +} + +//////////////////////////////////////////////////////////////////////////////// + +TProcessState::TProcessState() +{ + try { + FqdnHostName = ::FQDNHostName(); + } catch (const yexception& e) { + try { + FqdnHostName = ::HostName(); + } catch (const yexception& e) { + ythrow 
yexception() << "Cannot get fqdn and host name: " << e.what(); + } + } + + try { + UserName = ::GetUsername(); + } catch (const yexception& e) { + ythrow yexception() << "Cannot get user name: " << e.what(); + } + + Pid = static_cast<int>(getpid()); + + if (!ClientVersion) { + ClientVersion = ::TStringBuilder() << "YT C++ native " << GetProgramCommitId(); + } +} + +static TString CensorString(TString input) +{ + static const TString prefix = "AQAD-"; + if (input.find(prefix) == TString::npos) { + return input; + } else { + return TString(input.size(), '*'); + } +} + +void TProcessState::SetCommandLine(int argc, const char* argv[]) +{ + for (int i = 0; i < argc; ++i) { + CommandLine.push_back(argv[i]); + CensoredCommandLine.push_back(CensorString(CommandLine.back())); + } +} + +TProcessState* TProcessState::Get() +{ + return Singleton<TProcessState>(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/config.h b/yt/cpp/mapreduce/interface/config.h new file mode 100644 index 0000000000..c44ad25f1c --- /dev/null +++ b/yt/cpp/mapreduce/interface/config.h @@ -0,0 +1,228 @@ +#pragma once + +#include "fwd.h" +#include "common.h" +#include "node.h" + +#include <library/cpp/yt/misc/enum.h> + +#include <util/generic/maybe.h> +#include <util/generic/string.h> +#include <util/generic/hash_set.h> + +#include <util/datetime/base.h> + +namespace NYT { + +enum EEncoding : int +{ + E_IDENTITY /* "identity" */, + E_GZIP /* "gzip" */, + E_BROTLI /* "br" */, + E_Z_LZ4 /* "z-lz4" */, +}; + +enum class ENodeReaderFormat : int +{ + Yson, // Always use YSON format, + Skiff, // Always use Skiff format, throw exception if it's not possible (non-strict schema, dynamic table etc.) + Auto, // Use Skiff format if it's possible, YSON otherwise +}; + +enum class ETraceHttpRequestsMode +{ + // Never dump http requests. + Never /* "never" */, + // Dump failed http requests. 
+ Error /* "error" */, + // Dump all http requests. + Always /* "always" */, +}; + +DEFINE_ENUM(EUploadDeduplicationMode, + // For each file only one process' thread from all possible hosts can upload it to the file cache at the same time. + // The others will wait for the uploading to finish and use already cached file. + ((Global) (0)) + + // For each file and each particular host only one process' thread can upload it to the file cache at the same time. + // The others will wait for the uploading to finish and use already cached file. + ((Host) (1)) + + // All processes' threads will upload a file to the cache concurrently. + ((Disabled) (2)) +); + +//////////////////////////////////////////////////////////////////////////////// + +struct TConfig + : public TThrRefBase +{ + TString Hosts; + TString Pool; + TString Token; + TString Prefix; + TString ApiVersion; + TString LogLevel; + + // Compression for data that is sent to YT cluster. + EEncoding ContentEncoding; + + // Compression for data that is read from YT cluster. + EEncoding AcceptEncoding; + + TString GlobalTxId; + + bool ForceIpV4; + bool ForceIpV6; + bool UseHosts; + + TDuration HostListUpdateInterval; + + TNode Spec; + TNode TableWriter; + + TDuration ConnectTimeout; + TDuration SocketTimeout; + TDuration AddressCacheExpirationTimeout; + TDuration TxTimeout; + TDuration PingTimeout; + TDuration PingInterval; + + bool UseAsyncTxPinger; + int AsyncHttpClientThreads; + int AsyncTxPingerPoolThreads; + + // How often should we poll for lock state + TDuration WaitLockPollInterval; + + TDuration RetryInterval; + TDuration ChunkErrorsRetryInterval; + + TDuration RateLimitExceededRetryInterval; + TDuration StartOperationRetryInterval; + + int RetryCount; + int ReadRetryCount; + int StartOperationRetryCount; + + /// @brief Period for checking status of running operation. 
+ TDuration OperationTrackerPollPeriod = TDuration::Seconds(5); + + TString RemoteTempFilesDirectory; + TString RemoteTempTablesDirectory; + + // + // Infer schemas for nonexistent tables from typed rows (e.g. protobuf) + // when writing from operation or client writer. + // This option can be overridden in TOperationOptions and TTableWriterOptions. + bool InferTableSchema; + + bool UseClientProtobuf; + ENodeReaderFormat NodeReaderFormat; + bool ProtobufFormatWithDescriptors; + + int ConnectionPoolSize; + + /// Defines replication factor that is used for files that are uploaded to YT + /// to use them in operations. + int FileCacheReplicationFactor = 10; + + /// @brief Used when waiting for other process which uploads the same file to the file cache. + /// + /// If CacheUploadDeduplicationMode is not Disabled, current process can wait for some other + /// process which is uploading the same file. This value is proportional to the timeout of waiting, + /// the actual timeout is computed as follows: fileSizeGb * CacheLockTimeoutPerGb. + /// Default timeout assumes that host has uploading speed equal to 20 Mb/s. + /// If timeout was reached, the file will be uploaded by current process without any other waits. + TDuration CacheLockTimeoutPerGb; + + /// @brief Used to prevent concurrent uploading of the same file to the file cache. + /// NB: Each mode affects only users with the same mode enabled. + EUploadDeduplicationMode CacheUploadDeduplicationMode; + + bool MountSandboxInTmpfs; + + /// @brief Set upload options for files created by library. + /// + /// Path itself is always ignored but path options (e.g. `BypassArtifactCache`) are used when uploading system files: + /// cppbinary, job state, etc + TRichYPath ApiFilePathOptions; + + // Testing options, should never be used in user programs. 
+ bool UseAbortableResponse = false; + bool EnableDebugMetrics = false; + + // + // There is an optimization used with local YT that allows skipping binary upload and using the real binary path. + // When EnableLocalModeOptimization is set to false this optimization is completely disabled. + bool EnableLocalModeOptimization = true; + + // + // If you want to see stderr even if your jobs have not failed, set this to true. + bool WriteStderrSuccessfulJobs = false; + + // + // This configuration is useful for debug. + // If set to ETraceHttpRequestsMode::Error library will dump all http error requests. + // If set to ETraceHttpRequestsMode::Always library will dump all http requests. + // All tracing occurs as DEBUG level logging. + ETraceHttpRequestsMode TraceHttpRequestsMode = ETraceHttpRequestsMode::Never; + + TString SkynetApiHost; + + // Sets SO_PRIORITY option on the socket + TMaybe<int> SocketPriority; + + // Framing settings + // (cf. https://yt.yandex-team.ru/docs/description/proxy/http_proxy_reference#framing). 
+ THashSet<TString> CommandsWithFraming; + + static bool GetBool(const char* var, bool defaultValue = false); + static int GetInt(const char* var, int defaultValue); + static TDuration GetDuration(const char* var, TDuration defaultValue); + static EEncoding GetEncoding(const char* var); + static EUploadDeduplicationMode GetUploadingDeduplicationMode( + const char* var, + EUploadDeduplicationMode defaultValue); + + static void ValidateToken(const TString& token); + static TString LoadTokenFromFile(const TString& tokenPath); + + static TNode LoadJsonSpec(const TString& strSpec); + + static TRichYPath LoadApiFilePathOptions(const TString& ysonMap); + + void LoadToken(); + void LoadSpec(); + void LoadTimings(); + + void Reset(); + + TConfig(); + + static TConfigPtr Get(); +}; + +//////////////////////////////////////////////////////////////////////////////// + +struct TProcessState +{ + TString FqdnHostName; + TString UserName; + TVector<TString> CommandLine; + + // Command line with everything that looks like tokens censored. 
+ TVector<TString> CensoredCommandLine; + int Pid; + TString ClientVersion; + + TProcessState(); + + void SetCommandLine(int argc, const char* argv[]); + + static TProcessState* Get(); +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/config_ut.cpp b/yt/cpp/mapreduce/interface/config_ut.cpp new file mode 100644 index 0000000000..e49ba02108 --- /dev/null +++ b/yt/cpp/mapreduce/interface/config_ut.cpp @@ -0,0 +1,20 @@ +#include <library/cpp/testing/unittest/registar.h> + +#include <yt/cpp/mapreduce/interface/config.h> + +using namespace NYT; + +Y_UNIT_TEST_SUITE(ConfigSuite) +{ + Y_UNIT_TEST(TestReset) { + // very limited test, checks only one config field + + auto origConfig = *TConfig::Get(); + TConfig::Get()->Reset(); + UNIT_ASSERT_VALUES_EQUAL(origConfig.Hosts, TConfig::Get()->Hosts); + + TConfig::Get()->Hosts = "hosts/fb867"; + TConfig::Get()->Reset(); + UNIT_ASSERT_VALUES_EQUAL(origConfig.Hosts, TConfig::Get()->Hosts); + } +} diff --git a/yt/cpp/mapreduce/interface/constants.h b/yt/cpp/mapreduce/interface/constants.h new file mode 100644 index 0000000000..4f70410814 --- /dev/null +++ b/yt/cpp/mapreduce/interface/constants.h @@ -0,0 +1,19 @@ +#pragma once + + +#include <util/system/defaults.h> + + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + + +// Maximum number of input tables for operation. +// If greater number of input tables are provided behaviour is undefined +// (it might work ok or it might fail or it might work very slowly). 
+constexpr size_t MaxInputTableCount = 1000; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/cypress.cpp b/yt/cpp/mapreduce/interface/cypress.cpp new file mode 100644 index 0000000000..53686effd2 --- /dev/null +++ b/yt/cpp/mapreduce/interface/cypress.cpp @@ -0,0 +1,24 @@ +#include "cypress.h" + +#include "config.h" + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +void ICypressClient::Concatenate( + const TVector<TYPath>& sourcePaths, + const TYPath& destinationPath, + const TConcatenateOptions& options) +{ + TVector<TRichYPath> richSourcePaths; + richSourcePaths.reserve(sourcePaths.size()); + for (const auto& path : sourcePaths) { + richSourcePaths.emplace_back(path); + } + Concatenate(richSourcePaths, destinationPath, options); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/cypress.h b/yt/cpp/mapreduce/interface/cypress.h new file mode 100644 index 0000000000..e05316ebc6 --- /dev/null +++ b/yt/cpp/mapreduce/interface/cypress.h @@ -0,0 +1,252 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/cypress.h +/// +/// Header containing interface to execute [Cypress](https://yt.yandex-team.ru/docs/description/common/cypress.html)-related commands. + +#include "fwd.h" + +#include "client_method_options.h" +#include "common.h" +#include "node.h" + +#include <util/generic/maybe.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// Client interface to execute [Cypress](https://yt.yandex-team.ru/docs/description/common/cypress.html)-related commands. +class ICypressClient +{ +public: + virtual ~ICypressClient() = default; + + /// + /// @brief Create Cypress node of given type. + /// + /// @param path Path in Cypress to the new object. 
+ /// @param type New node type. + /// @param options Optional parameters. + /// + /// @return Id of the created node. + /// + /// @note All but the last components must exist unless @ref NYT::TCreateOptions::Recursive is `true`. + /// + /// @note The node itself must not exist unless @ref NYT::TCreateOptions::IgnoreExisting or @ref NYT::TCreateOptions::Force are `true`. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#create) + virtual TNodeId Create( + const TYPath& path, + ENodeType type, + const TCreateOptions& options = TCreateOptions()) = 0; + + /// + /// @brief Create table with schema inferred from the template argument. + /// + /// @tparam TRowType type of C++ representation of the row to be stored in the table. + /// @param path Path in Cypress to the new table. + /// @param sortColumns List of columns to mark as sorted in schema. + /// @param options Optional parameters. + /// + /// @return Id of the created node. + /// + /// @note If "schema" is passed in `options.Attributes` it has priority over the deduced schema (the latter is ignored). + template <typename TRowType> + TNodeId CreateTable( + const TYPath& path, + const TSortColumns& sortColumns = TSortColumns(), + const TCreateOptions& options = TCreateOptions()); + + /// + /// @brief Remove Cypress node. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#remove) + virtual void Remove( + const TYPath& path, + const TRemoveOptions& options = TRemoveOptions()) = 0; + + /// + /// @brief Check if Cypress node exists. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#exists) + virtual bool Exists( + const TYPath& path, + const TExistsOptions& options = TExistsOptions()) = 0; + + /// + /// @brief Get Cypress node contents. 
+ /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get) + virtual TNode Get( + const TYPath& path, + const TGetOptions& options = TGetOptions()) = 0; + + /// + /// @brief Set Cypress node contents. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#set) + virtual void Set( + const TYPath& path, + const TNode& value, + const TSetOptions& options = TSetOptions()) = 0; + + /// + /// @brief Set multiple attributes for cypress path. + /// + /// @param path Path to root of the attributes to be set e.g. "//path/to/table/@"; + /// it is important to make sure that path ends with "/@". + /// @param attributes Map with attributes + /// @param options Optional parameters. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#multiset_attributes) + virtual void MultisetAttributes( + const TYPath& path, + const TNode::TMapType& attributes, + const TMultisetAttributesOptions& options = TMultisetAttributesOptions()) = 0; + + /// + /// @brief List Cypress map or attribute node keys. + /// + /// @param path Path in the tree to the node in question. + /// @param options Optional parameters. + /// + /// @return List of keys with attributes (if they were required in @ref NYT::TListOptions::AttributeFilter). + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#list) + virtual TNode::TListType List( + const TYPath& path, + const TListOptions& options = TListOptions()) = 0; + + /// + /// @brief Copy Cypress node. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#copy) + virtual TNodeId Copy( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TCopyOptions& options = TCopyOptions()) = 0; + + /// + /// @brief Move Cypress node (equivalent to copy-then-remove). 
+ /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#move) + virtual TNodeId Move( + const TYPath& sourcePath, + const TYPath& destinationPath, + const TMoveOptions& options = TMoveOptions()) = 0; + + /// + /// @brief Create link to Cypress node. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#link) + virtual TNodeId Link( + const TYPath& targetPath, + const TYPath& linkPath, + const TLinkOptions& options = TLinkOptions()) = 0; + + /// + /// @brief Concatenate several tables into one. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#concatenate) + virtual void Concatenate( + const TVector<TRichYPath>& sourcePaths, + const TRichYPath& destinationPath, + const TConcatenateOptions& options = TConcatenateOptions()) = 0; + + /// + /// @brief Concatenate several tables into one. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#concatenate) + virtual void Concatenate( + const TVector<TYPath>& sourcePaths, + const TYPath& destinationPath, + const TConcatenateOptions& options = TConcatenateOptions()); + + /// + /// @brief Canonize YPath, moving all the complex YPath features to attributes. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#parse-ypath) + virtual TRichYPath CanonizeYPath(const TRichYPath& path) = 0; + + /// + /// @brief Get statistics for given sets of columns in given table ranges. + /// + /// @note Paths must contain column selectors. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-table-columnar-statistics) + virtual TVector<TTableColumnarStatistics> GetTableColumnarStatistics( + const TVector<TRichYPath>& paths, + const TGetTableColumnarStatisticsOptions& options = {}) = 0; + + /// + /// @brief Divide input tables into disjoint partitions. + /// + /// Resulted partitions are vectors of rich YPaths. + /// Each partition can be given to a separate worker for further independent processing. 
+ /// + virtual TMultiTablePartitions GetTablePartitions( + const TVector<TRichYPath>& paths, + const TGetTablePartitionsOptions& options) = 0; + + /// + /// @brief Get file from file cache. + /// + /// @param md5Signature MD5 digest of the file. + /// @param cachePath Path to the file cache. + /// @param options Optional parameters. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#get-file-from-cache) + virtual TMaybe<TYPath> GetFileFromCache( + const TString& md5Signature, + const TYPath& cachePath, + const TGetFileFromCacheOptions& options = TGetFileFromCacheOptions()) = 0; + + /// + /// @brief Put file to file cache. + /// + /// @param filePath Path in Cypress to the file to cache. + /// @param md5Signature Expected MD5 digest of the file. + /// @param cachePath Path to the file cache. + /// @param options Optional parameters. + /// + /// @note The file in `filePath` must have been written with @ref NYT::TFileWriterOptions::ComputeMD5 set to `true`. + /// + /// @see [YT doc](https://yt.yandex-team.ru/docs/api/commands.html#put-file-to-cache) + virtual TYPath PutFileToCache( + const TYPath& filePath, + const TString& md5Signature, + const TYPath& cachePath, + const TPutFileToCacheOptions& options = TPutFileToCacheOptions()) = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TRowType> +TNodeId ICypressClient::CreateTable( + const TYPath& path, + const TSortColumns& sortColumns, + const TCreateOptions& options) +{ + static_assert( + std::is_base_of_v<::google::protobuf::Message, TRowType>, + "TRowType must be inherited from google::protobuf::Message"); + + TCreateOptions actualOptions = options; + if (!actualOptions.Attributes_) { + actualOptions.Attributes_ = TNode::CreateMap(); + } + + if (!actualOptions.Attributes_->HasKey("schema")) { + actualOptions.Attributes_->AsMap().emplace( + "schema", + CreateTableSchema<TRowType>(sortColumns).ToNode()); + } + + return Create(path, 
ENodeType::NT_TABLE, actualOptions); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/error_codes.h b/yt/cpp/mapreduce/interface/error_codes.h new file mode 100644 index 0000000000..d8d76e04fd --- /dev/null +++ b/yt/cpp/mapreduce/interface/error_codes.h @@ -0,0 +1,468 @@ +#pragma once + +// +// generated by generate-error-codes.py +// + +namespace NYT { +namespace NClusterErrorCodes { + + + +// from ./core/misc/public.h + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int OK = 0; + constexpr int Generic = 1; + constexpr int Canceled = 2; + constexpr int Timeout = 3; + +//////////////////////////////////////////////////////////////////////////////// + + + + +// from ./core/rpc/public.h +namespace NRpc { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int TransportError = 100; + constexpr int ProtocolError = 101; + constexpr int NoSuchService = 102; + constexpr int NoSuchMethod = 103; + constexpr int Unavailable = 105; + constexpr int PoisonPill = 106; + constexpr int RequestQueueSizeLimitExceeded = 108; + constexpr int AuthenticationError = 109; + constexpr int InvalidCsrfToken = 110; + constexpr int InvalidCredentials = 111; + constexpr int StreamingNotSupported = 112; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NRpc + + + +// from ./core/bus/public.h +namespace NBus { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int TransportError = 100; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NBus + + + +// from ./client/scheduler/public.h +namespace NScheduler { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int NoSuchOperation = 200; + constexpr int 
InvalidOperationState = 201; + constexpr int TooManyOperations = 202; + constexpr int NoSuchJob = 203; + constexpr int OperationFailedOnJobRestart = 210; + constexpr int OperationFailedWithInconsistentLocking = 211; + constexpr int OperationControllerCrashed = 212; + constexpr int TestingError = 213; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NScheduler + + + +// from ./client/table_client/public.h +namespace NTableClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int SortOrderViolation = 301; + constexpr int InvalidDoubleValue = 302; + constexpr int IncomparableType = 303; + constexpr int UnhashableType = 304; + // E.g. name table with more than #MaxColumnId columns (may come from legacy chunks). + constexpr int CorruptedNameTable = 305; + constexpr int UniqueKeyViolation = 306; + constexpr int SchemaViolation = 307; + constexpr int RowWeightLimitExceeded = 308; + constexpr int InvalidColumnFilter = 309; + constexpr int InvalidColumnRenaming = 310; + constexpr int IncompatibleKeyColumns = 311; + constexpr int ReaderDeadlineExpired = 312; + constexpr int TimestampOutOfRange = 313; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NTableClient + + + +// from ./client/cypress_client/public.h +namespace NCypressClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int SameTransactionLockConflict = 400; + constexpr int DescendantTransactionLockConflict = 401; + constexpr int ConcurrentTransactionLockConflict = 402; + constexpr int PendingLockConflict = 403; + constexpr int LockDestroyed = 404; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NCypressClient + + + +// from ./core/ytree/public.h +namespace NYTree { + +//////////////////////////////////////////////////////////////////////////////// + + 
constexpr int ResolveError = 500; + constexpr int AlreadyExists = 501; + constexpr int MaxChildCountViolation = 502; + constexpr int MaxStringLengthViolation = 503; + constexpr int MaxAttributeSizeViolation = 504; + constexpr int MaxKeyLengthViolation = 505; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYTree + + + +// from ./client/hydra/public.h +namespace NHydra { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int NoSuchSnapshot = 600; + constexpr int NoSuchChangelog = 601; + constexpr int InvalidEpoch = 602; + constexpr int InvalidVersion = 603; + constexpr int OutOfOrderMutations = 609; + constexpr int InvalidSnapshotVersion = 610; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NHydra + + + +// from ./client/chunk_client/public.h +namespace NChunkClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int AllTargetNodesFailed = 700; + constexpr int SendBlocksFailed = 701; + constexpr int NoSuchSession = 702; + constexpr int SessionAlreadyExists = 703; + constexpr int ChunkAlreadyExists = 704; + constexpr int WindowError = 705; + constexpr int BlockContentMismatch = 706; + constexpr int NoSuchBlock = 707; + constexpr int NoSuchChunk = 708; + constexpr int NoLocationAvailable = 710; + constexpr int IOError = 711; + constexpr int MasterCommunicationFailed = 712; + constexpr int NoSuchChunkTree = 713; + constexpr int MasterNotConnected = 714; + constexpr int ChunkUnavailable = 716; + constexpr int NoSuchChunkList = 717; + constexpr int WriteThrottlingActive = 718; + constexpr int NoSuchMedium = 719; + constexpr int OptimisticLockFailure = 720; + constexpr int InvalidBlockChecksum = 721; + constexpr int BlockOutOfRange = 722; + constexpr int ObjectNotReplicated = 723; + constexpr int MissingExtension = 724; + constexpr int BandwidthThrottlingFailed = 725; 
+ constexpr int ReaderTimeout = 726; + constexpr int NoSuchChunkView = 727; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NChunkClient + + + +// from ./client/election/public.h +namespace NElection { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int InvalidState = 800; + constexpr int InvalidLeader = 801; + constexpr int InvalidEpoch = 802; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NElection + + + +// from ./client/security_client/public.h +namespace NSecurityClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int AuthenticationError = 900; + constexpr int AuthorizationError = 901; + constexpr int AccountLimitExceeded = 902; + constexpr int UserBanned = 903; + constexpr int RequestQueueSizeLimitExceeded = 904; + constexpr int NoSuchAccount = 905; + constexpr int SafeModeEnabled = 906; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NSecurityClient + + + +// from ./client/object_client/public.h +namespace NObjectClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int PrerequisiteCheckFailed = 1000; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NObjectClient + + + +// from ./server/lib/exec_agent/public.h +namespace NExecAgent { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int ConfigCreationFailed = 1100; + constexpr int AbortByScheduler = 1101; + constexpr int ResourceOverdraft = 1102; + constexpr int WaitingJobTimeout = 1103; + constexpr int SlotNotFound = 1104; + constexpr int JobEnvironmentDisabled = 1105; + constexpr int JobProxyConnectionFailed = 1106; + constexpr int ArtifactCopyingFailed = 1107; + constexpr int 
NodeDirectoryPreparationFailed = 1108; + constexpr int SlotLocationDisabled = 1109; + constexpr int QuotaSettingFailed = 1110; + constexpr int RootVolumePreparationFailed = 1111; + constexpr int NotEnoughDiskSpace = 1112; + constexpr int ArtifactDownloadFailed = 1113; + constexpr int JobProxyPreparationTimeout = 1114; + constexpr int JobPreparationTimeout = 1115; + constexpr int JobProxyFailed = 1120; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NExecAgent + + + +// from ./ytlib/job_proxy/public.h +namespace NJobProxy { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int MemoryLimitExceeded = 1200; + constexpr int MemoryCheckFailed = 1201; + constexpr int JobTimeLimitExceeded = 1202; + constexpr int UnsupportedJobType = 1203; + constexpr int JobNotPrepared = 1204; + constexpr int UserJobFailed = 1205; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NJobProxy + + + +// from ./server/node/data_node/public.h +namespace NDataNode { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int LocalChunkReaderFailed = 1300; + constexpr int LayerUnpackingFailed = 1301; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDataNode + + + +// from ./core/net/public.h +namespace NNet { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int Aborted = 1500; + constexpr int ResolveTimedOut = 1501; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NNet + + + +// from ./client/node_tracker_client/public.h +namespace NNodeTrackerClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int NoSuchNode = 1600; + constexpr int InvalidState = 1601; + constexpr int NoSuchNetwork = 1602; + 
constexpr int NoSuchRack = 1603; + constexpr int NoSuchDataCenter = 1604; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NNodeTrackerClient + + + +// from ./client/tablet_client/public.h +namespace NTabletClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int TransactionLockConflict = 1700; + constexpr int NoSuchTablet = 1701; + constexpr int TabletNotMounted = 1702; + constexpr int AllWritesDisabled = 1703; + constexpr int InvalidMountRevision = 1704; + constexpr int TableReplicaAlreadyExists = 1705; + constexpr int InvalidTabletState = 1706; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NTabletClient + + + +// from ./server/lib/shell/public.h +namespace NShell { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int ShellExited = 1800; + constexpr int ShellManagerShutDown = 1801; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NShell + + + +// from ./client/api/public.h +namespace NApi { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int TooManyConcurrentRequests = 1900; + constexpr int JobArchiveUnavailable = 1910; + constexpr int RetriableArchiveError = 1911; + constexpr int NoSuchOperation = 1915; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NApi + + + +// from ./server/controller_agent/chunk_pools/public.h +namespace NChunkPools { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int DataSliceLimitExceeded = 2000; + constexpr int MaxDataWeightPerJobExceeded = 2001; + constexpr int MaxPrimaryDataWeightPerJobExceeded = 2002; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NChunkPools + + + 
+// from ./client/api/rpc_proxy/public.h +namespace NApi { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int ProxyBanned = 2100; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NApi + + + +// from ./ytlib/controller_agent/public.h +namespace NControllerAgent { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int AgentCallFailed = 4400; + constexpr int NoOnlineNodeToScheduleJob = 4410; + constexpr int MaterializationFailed = 4415; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NControllerAgent + + + +// from ./client/transaction_client/public.h +namespace NTransactionClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int NoSuchTransaction = 11000; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NTransactionClient + + + +// from ./server/lib/containers/public.h +namespace NContainers { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int FailedToStartContainer = 13000; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NContainers + + + +// from ./ytlib/job_prober_client/public.h +namespace NJobProberClient { + +//////////////////////////////////////////////////////////////////////////////// + + constexpr int JobIsNotRunning = 17000; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NJobProberClient + +} // namespace NClusterErrorCodes +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/error_ut.cpp b/yt/cpp/mapreduce/interface/error_ut.cpp new file mode 100644 index 0000000000..03f2751b23 --- /dev/null +++ b/yt/cpp/mapreduce/interface/error_ut.cpp @@ -0,0 +1,81 @@ +#include 
<library/cpp/testing/unittest/registar.h> + +#include <library/cpp/json/json_reader.h> + +#include <yt/cpp/mapreduce/interface/errors.h> +#include <yt/cpp/mapreduce/common/helpers.h> + +using namespace NYT; + +template<> +void Out<NYT::TNode>(IOutputStream& s, const NYT::TNode& node) +{ + s << "TNode:" << NodeToYsonString(node); +} + +Y_UNIT_TEST_SUITE(ErrorSuite) +{ + Y_UNIT_TEST(TestParseJson) + { + // Scary real world error! Бу! + const char* jsonText = + R"""({)""" + R"""("code":500,)""" + R"""("message":"Error resolving path //home/user/link",)""" + R"""("attributes":{)""" + R"""("fid":18446484571700269066,)""" + R"""("method":"Create",)""" + R"""("tid":17558639495721339338,)""" + R"""("datetime":"2017-04-07T13:38:56.474819Z",)""" + R"""("pid":414529,)""" + R"""("host":"build01-01g.yt.yandex.net"},)""" + R"""("inner_errors":[{)""" + R"""("code":1,)""" + R"""("message":"Node //tt cannot have children",)""" + R"""("attributes":{)""" + R"""("fid":18446484571700269066,)""" + R"""("tid":17558639495721339338,)""" + R"""("datetime":"2017-04-07T13:38:56.474725Z",)""" + R"""("pid":414529,)""" + R"""("host":"build01-01g.yt.yandex.net"},)""" + R"""("inner_errors":[]}]})"""; + + NJson::TJsonValue jsonValue; + ReadJsonFastTree(jsonText, &jsonValue, /*throwOnError=*/ true); + + TYtError error(jsonValue); + UNIT_ASSERT_VALUES_EQUAL(error.GetCode(), 500); + UNIT_ASSERT_VALUES_EQUAL(error.GetMessage(), R"""(Error resolving path //home/user/link)"""); + UNIT_ASSERT_VALUES_EQUAL(error.InnerErrors().size(), 1); + UNIT_ASSERT_VALUES_EQUAL(error.InnerErrors()[0].GetCode(), 1); + + UNIT_ASSERT_VALUES_EQUAL(error.HasAttributes(), true); + UNIT_ASSERT_VALUES_EQUAL(error.GetAttributes().at("method"), TNode("Create")); + + UNIT_ASSERT_VALUES_EQUAL(error.GetAllErrorCodes(), TSet<int>({500, 1})); + } + + Y_UNIT_TEST(TestGetYsonText) { + const char* jsonText = + R"""({)""" + R"""("code":500,)""" + R"""("message":"outer error",)""" + R"""("attributes":{)""" + R"""("method":"Create",)""" + 
R"""("pid":414529},)""" + R"""("inner_errors":[{)""" + R"""("code":1,)""" + R"""("message":"inner error",)""" + R"""("attributes":{},)""" + R"""("inner_errors":[])""" + R"""(}]})"""; + TYtError error; + error.ParseFrom(jsonText); + TString ysonText = error.GetYsonText(); + TYtError error2(NodeFromYsonString(ysonText)); + UNIT_ASSERT_EQUAL( + ysonText, + R"""({"code"=500;"message"="outer error";"attributes"={"method"="Create";"pid"=414529};"inner_errors"=[{"code"=1;"message"="inner error"}]})"""); + UNIT_ASSERT_EQUAL(error2.GetYsonText(), ysonText); + } +} diff --git a/yt/cpp/mapreduce/interface/errors.cpp b/yt/cpp/mapreduce/interface/errors.cpp new file mode 100644 index 0000000000..49a7c7cfc1 --- /dev/null +++ b/yt/cpp/mapreduce/interface/errors.cpp @@ -0,0 +1,437 @@ +#include "errors.h" + +#include <library/cpp/yson/node/node_io.h> +#include <library/cpp/yson/node/node_visitor.h> + +#include <yt/cpp/mapreduce/interface/error_codes.h> + +#include <library/cpp/json/json_reader.h> +#include <library/cpp/yson/writer.h> + +#include <util/string/builder.h> +#include <util/stream/str.h> +#include <util/generic/set.h> + +namespace NYT { + +using namespace NJson; + +//////////////////////////////////////////////////////////////////// + +static void WriteErrorDescription(const TYtError& error, IOutputStream* out) +{ + (*out) << '\'' << error.GetMessage() << '\''; + const auto& innerErrorList = error.InnerErrors(); + if (!innerErrorList.empty()) { + (*out) << " { "; + bool first = true; + for (const auto& innerError : innerErrorList) { + if (first) { + first = false; + } else { + (*out) << " ; "; + } + WriteErrorDescription(innerError, out); + } + (*out) << " }"; + } +} + +static void SerializeError(const TYtError& error, NYson::IYsonConsumer* consumer) +{ + consumer->OnBeginMap(); + { + consumer->OnKeyedItem("code"); + consumer->OnInt64Scalar(error.GetCode()); + + consumer->OnKeyedItem("message"); + consumer->OnStringScalar(error.GetMessage()); + + if 
(!error.GetAttributes().empty()) { + consumer->OnKeyedItem("attributes"); + consumer->OnBeginMap(); + { + for (const auto& item : error.GetAttributes()) { + consumer->OnKeyedItem(item.first); + TNodeVisitor(consumer).Visit(item.second); + } + } + consumer->OnEndMap(); + } + + if (!error.InnerErrors().empty()) { + consumer->OnKeyedItem("inner_errors"); + { + consumer->OnBeginList(); + for (const auto& innerError : error.InnerErrors()) { + SerializeError(innerError, consumer); + } + consumer->OnEndList(); + } + } + } + consumer->OnEndMap(); +} + +static TString DumpJobInfoForException(const TOperationId& operationId, const TVector<TFailedJobInfo>& failedJobInfoList) +{ + ::TStringBuilder output; + // Exceptions have limit to contain 65508 bytes of text, so we also limit stderr text + constexpr size_t MAX_SIZE = 65508 / 2; + + size_t written = 0; + for (const auto& failedJobInfo : failedJobInfoList) { + if (written >= MAX_SIZE) { + break; + } + TStringStream nextChunk; + nextChunk << '\n'; + nextChunk << "OperationId: " << GetGuidAsString(operationId) << " JobId: " << GetGuidAsString(failedJobInfo.JobId) << '\n'; + nextChunk << "Error: " << failedJobInfo.Error.FullDescription() << '\n'; + if (!failedJobInfo.Stderr.empty()) { + nextChunk << "Stderr: " << Endl; + size_t tmpWritten = written + nextChunk.Str().size(); + if (tmpWritten >= MAX_SIZE) { + break; + } + + if (tmpWritten + failedJobInfo.Stderr.size() > MAX_SIZE) { + nextChunk << failedJobInfo.Stderr.substr(failedJobInfo.Stderr.size() - (MAX_SIZE - tmpWritten)); + } else { + nextChunk << failedJobInfo.Stderr; + } + } + written += nextChunk.Str().size(); + output << nextChunk.Str(); + } + return output; +} + +//////////////////////////////////////////////////////////////////// + +TYtError::TYtError() + : Code_(0) +{ } + +TYtError::TYtError(const TString& message) + : Code_(NYT::NClusterErrorCodes::Generic) + , Message_(message) +{ } + +TYtError::TYtError(int code, const TString& message) + : Code_(code) + , 
Message_(message) +{ } + +TYtError::TYtError(const TJsonValue& value) +{ + const TJsonValue::TMapType& map = value.GetMap(); + TJsonValue::TMapType::const_iterator it = map.find("message"); + if (it != map.end()) { + Message_ = it->second.GetString(); + } + + it = map.find("code"); + if (it != map.end()) { + Code_ = static_cast<int>(it->second.GetInteger()); + } else { + Code_ = NYT::NClusterErrorCodes::Generic; + } + + it = map.find("inner_errors"); + if (it != map.end()) { + const TJsonValue::TArray& innerErrors = it->second.GetArray(); + for (const auto& innerError : innerErrors) { + InnerErrors_.push_back(TYtError(innerError)); + } + } + + it = map.find("attributes"); + if (it != map.end()) { + auto attributes = NYT::NodeFromJsonValue(it->second); + if (attributes.IsMap()) { + Attributes_ = std::move(attributes.AsMap()); + } + } +} + +TYtError::TYtError(const TNode& node) +{ + const auto& map = node.AsMap(); + auto it = map.find("message"); + if (it != map.end()) { + Message_ = it->second.AsString(); + } + + it = map.find("code"); + if (it != map.end()) { + Code_ = static_cast<int>(it->second.AsInt64()); + } else { + Code_ = NYT::NClusterErrorCodes::Generic; + } + + it = map.find("inner_errors"); + if (it != map.end()) { + const auto& innerErrors = it->second.AsList(); + for (const auto& innerError : innerErrors) { + InnerErrors_.push_back(TYtError(innerError)); + } + } + + it = map.find("attributes"); + if (it != map.end()) { + auto& attributes = it->second; + if (attributes.IsMap()) { + Attributes_ = std::move(attributes.AsMap()); + } + } +} + +int TYtError::GetCode() const +{ + return Code_; +} + +const TString& TYtError::GetMessage() const +{ + return Message_; +} + +const TVector<TYtError>& TYtError::InnerErrors() const +{ + return InnerErrors_; +} + +void TYtError::ParseFrom(const TString& jsonError) +{ + TJsonValue value; + TStringInput input(jsonError); + ReadJsonTree(&input, &value); + *this = TYtError(value); +} + +TSet<int> 
TYtError::GetAllErrorCodes() const +{ + TDeque<const TYtError*> queue = {this}; + TSet<int> result; + while (!queue.empty()) { + const auto* current = queue.front(); + queue.pop_front(); + result.insert(current->Code_); + for (const auto& error : current->InnerErrors_) { + queue.push_back(&error); + } + } + return result; +} + +bool TYtError::ContainsErrorCode(int code) const +{ + if (Code_ == code) { + return true; + } + for (const auto& error : InnerErrors_) { + if (error.ContainsErrorCode(code)) { + return true; + } + } + return false; +} + + +bool TYtError::ContainsText(const TStringBuf& text) const +{ + if (Message_.Contains(text)) { + return true; + } + for (const auto& error : InnerErrors_) { + if (error.ContainsText(text)) { + return true; + } + } + return false; +} + +bool TYtError::HasAttributes() const +{ + return !Attributes_.empty(); +} + +const TNode::TMapType& TYtError::GetAttributes() const +{ + return Attributes_; +} + +TString TYtError::GetYsonText() const +{ + TStringStream out; + ::NYson::TYsonWriter writer(&out, NYson::EYsonFormat::Text); + SerializeError(*this, &writer); + return std::move(out.Str()); +} + +TString TYtError::ShortDescription() const +{ + TStringStream out; + WriteErrorDescription(*this, &out); + return std::move(out.Str()); +} + +TString TYtError::FullDescription() const +{ + TStringStream s; + WriteErrorDescription(*this, &s); + s << "; full error: " << GetYsonText(); + return s.Str(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TErrorResponse::TErrorResponse(int httpCode, const TString& requestId) + : HttpCode_(httpCode) + , RequestId_(requestId) +{ } + +bool TErrorResponse::IsOk() const +{ + return Error_.GetCode() == 0; +} + +void TErrorResponse::SetRawError(const TString& message) +{ + Error_ = TYtError(message); + Setup(); +} + +void TErrorResponse::SetError(TYtError error) +{ + Error_ = std::move(error); + Setup(); +} + +void TErrorResponse::ParseFromJsonError(const TString& 
jsonError) +{ + Error_.ParseFrom(jsonError); + Setup(); +} + +void TErrorResponse::SetIsFromTrailers(bool isFromTrailers) +{ + IsFromTrailers_ = isFromTrailers; +} + +int TErrorResponse::GetHttpCode() const +{ + return HttpCode_; +} + +bool TErrorResponse::IsFromTrailers() const +{ + return IsFromTrailers_; +} + +bool TErrorResponse::IsTransportError() const +{ + return HttpCode_ == 503; +} + +TString TErrorResponse::GetRequestId() const +{ + return RequestId_; +} + +const TYtError& TErrorResponse::GetError() const +{ + return Error_; +} + +bool TErrorResponse::IsResolveError() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NYTree::ResolveError); +} + +bool TErrorResponse::IsAccessDenied() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NSecurityClient::AuthorizationError); +} + +bool TErrorResponse::IsConcurrentTransactionLockConflict() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NCypressClient::ConcurrentTransactionLockConflict); +} + +bool TErrorResponse::IsRequestRateLimitExceeded() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NSecurityClient::RequestQueueSizeLimitExceeded); +} + +bool TErrorResponse::IsRequestQueueSizeLimitExceeded() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NRpc::RequestQueueSizeLimitExceeded); +} + +bool TErrorResponse::IsChunkUnavailable() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NChunkClient::ChunkUnavailable); +} + +bool TErrorResponse::IsRequestTimedOut() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::Timeout); +} + +bool TErrorResponse::IsNoSuchTransaction() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NTransactionClient::NoSuchTransaction); +} + +bool TErrorResponse::IsConcurrentOperationsLimitReached() const +{ + return Error_.ContainsErrorCode(NClusterErrorCodes::NScheduler::TooManyOperations); +} + +void TErrorResponse::Setup() +{ + TStringStream s; + *this << Error_.FullDescription(); +} + 
+//////////////////////////////////////////////////////////////////// + +TOperationFailedError::TOperationFailedError( + EState state, + TOperationId id, + TYtError ytError, + TVector<TFailedJobInfo> failedJobInfo) + : State_(state) + , OperationId_(id) + , Error_(std::move(ytError)) + , FailedJobInfo_(std::move(failedJobInfo)) +{ + *this << Error_.FullDescription(); + if (!FailedJobInfo_.empty()) { + *this << DumpJobInfoForException(OperationId_, FailedJobInfo_); + } +} + +TOperationFailedError::EState TOperationFailedError::GetState() const +{ + return State_; +} + +TOperationId TOperationFailedError::GetOperationId() const +{ + return OperationId_; +} + +const TYtError& TOperationFailedError::GetError() const +{ + return Error_; +} + +const TVector<TFailedJobInfo>& TOperationFailedError::GetFailedJobInfo() const +{ + return FailedJobInfo_; +} + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/errors.h b/yt/cpp/mapreduce/interface/errors.h new file mode 100644 index 0000000000..afad58ed72 --- /dev/null +++ b/yt/cpp/mapreduce/interface/errors.h @@ -0,0 +1,290 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/errors.h +/// +/// Errors and exceptions emitted by library. + +#include "fwd.h" +#include "common.h" + +#include <library/cpp/yson/node/node.h> + +#include <util/generic/bt_exception.h> +#include <util/generic/yexception.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> + +namespace NJson { + class TJsonValue; +} // namespace NJson + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Error that is thrown when library detects invalid usage of API. +/// +/// For example trying to start operations on empty table list. +class TApiUsageError + : public TWithBackTrace<yexception> +{ }; + +/// +/// @brief Error that is thrown when request retries continues for too long. 
+/// +/// @see NYT::TRetryConfig +/// @see NYT::IRetryConfigProvider +class TRequestRetriesTimeout + : public yexception +{ }; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Error returned by YT cluster. +/// +/// An object of this class describe error that happened on YT server. +/// Internally each error is a tree. Each node of the tree contains: +/// - integer error code; +/// - text description of error; +/// - attributes describing error context. +/// +/// To get text description of an error one should use +/// @ref NYT::TYtError::ShortDescription or @ref NYT::TYtError::FullDescription +/// +/// To distinguish between error kinds @ref NYT::TYtError::ContainsErrorCode should be used. +/// +/// @see NYT::TErrorResponse +/// @see NYT::TOperationFailedError +class TYtError +{ +public: + /// Constructs error with NYT::NClusterErrorCodes::OK code and empty message. + TYtError(); + + /// Constructs error with NYT::NClusterErrorCodes::Generic code and given message. + explicit TYtError(const TString& message); + + /// Constructs error with given code and given message. + TYtError(int code, const TString& message); + + /// Construct error from json representation. + TYtError(const ::NJson::TJsonValue& value); + + /// Construct error from TNode representation. + TYtError(const TNode& value); + + /// + /// @brief Check if error or any of inner errors has given error code. + /// + /// Use this method to distinguish kind of error. + bool ContainsErrorCode(int code) const; + + /// + /// @brief Get short description of error. + /// + /// Short description contain text description of error and all inner errors. + /// It is human readable but misses some important information (error codes, error attributes). + /// + /// Usually it's better to use @ref NYT::TYtError::FullDescription to log errors. + TString ShortDescription() const; + + /// + /// @brief Get full description of error. 
+ /// + /// Full description contains readable short description + /// followed by text yson representation of error that contains error codes and attributes. + TString FullDescription() const; + + /// + /// @brief Get error code of the topmost error. + /// + /// @warning Do not use this method to distinguish between error kinds + /// @ref NYT::TYtError::ContainsErrorCode should be used instead. + int GetCode() const; + + /// + /// @brief Get error text of the topmost error. + /// + /// @warning This method should not be used to log errors + /// since text description of inner errors is going to be lost. + /// @ref NYT::TYtError::FullDescription should be used instead. + const TString& GetMessage() const; + + /// + /// @brief Check if error or any of inner errors contains given text chunk. + /// + /// @warning @ref NYT::TYtError::ContainsErrorCode must be used instead of + /// this method when possible. If there is no suitable error code it's + /// better to ask yt@ to add one. This method should only be used as workaround. + bool ContainsText(const TStringBuf& text) const; + + /// @brief Get inner errors. + const TVector<TYtError>& InnerErrors() const; + + /// Parse error from json string. + void ParseFrom(const TString& jsonError); + + /// Collect error codes from entire error tree. + TSet<int> GetAllErrorCodes() const; + + /// Check if error has any attributes. + bool HasAttributes() const; + + /// Get error attributes. + const TNode::TMapType& GetAttributes() const; + + /// Get text yson representation of error + TString GetYsonText() const; + +private: + int Code_; + TString Message_; + TVector<TYtError> InnerErrors_; + TNode::TMapType Attributes_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Generic error response returned by server. +/// +/// TErrorResponse can be thrown from almost any client method when server responds with error. 
+/// +class TErrorResponse + : public yexception +{ +public: + TErrorResponse(int httpCode, const TString& requestId); + TErrorResponse(int httpCode, TYtError error); + + /// Get error object returned by server. + const TYtError& GetError() const; + + /// Get if (correlation-id) of request that was responded with error. + TString GetRequestId() const; + + /// Get HTTP code of response. + int GetHttpCode() const; + + /// Is error parsed from response trailers. + bool IsFromTrailers() const; + + /// Check if error was caused by transport problems inside YT cluster. + bool IsTransportError() const; + + /// Check if error was caused by failure to resolve cypress path. + bool IsResolveError() const; + + /// Check if error was caused by lack of permissions to execute request. + bool IsAccessDenied() const; + + /// Check if error was caused by failure to lock object because of another transaction is holding lock. + bool IsConcurrentTransactionLockConflict() const; + + /// Check if error was caused by request quota limit exceeding. + bool IsRequestRateLimitExceeded() const; + + // YT can't serve request because it is overloaded. + bool IsRequestQueueSizeLimitExceeded() const; + + /// Check if error was caused by failure to get chunk. Such errors are almost always temporary. + bool IsChunkUnavailable() const; + + /// Check if error was caused by internal YT timeout. + bool IsRequestTimedOut() const; + + /// Check if error was caused by trying to work with transaction that was finished or never existed. + bool IsNoSuchTransaction() const; + + // User reached their limit of concurrently running operations. + bool IsConcurrentOperationsLimitReached() const; + + /// @deprecated This method must not be used. 
+ bool IsOk() const; + + void SetRawError(const TString& message); + void SetError(TYtError error); + void ParseFromJsonError(const TString& jsonError); + void SetIsFromTrailers(bool isFromTrailers); + +private: + void Setup(); + +private: + int HttpCode_; + TString RequestId_; + TYtError Error_; + bool IsFromTrailers_ = false; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Info about failed jobs. +/// +/// @see NYT::TOperationFailedError +struct TFailedJobInfo +{ + /// Id of a job. + TJobId JobId; + + /// Error describing job failure. + TYtError Error; + + /// Stderr of job. + /// + /// @note YT doesn't store all job stderrs, check @ref NYT::IOperationClient::GetJobStderr + /// for list of limitations. + /// + /// @see NYT::IOperationClient::GetJobStderr + TString Stderr; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Error that is thrown when operation watched by library fails. +/// +/// This error is thrown from operation starting methods when they are started in sync mode (@ref NYT::TOperationOptions::Wait == true) +/// or from future returned by NYT::IOperation::Watch. +/// +/// @see NYT::IOperationClient +class TOperationFailedError + : public yexception +{ +public: + /// Final state of operation. + enum EState { + /// Operation failed due to some error. + Failed, + /// Operation didn't experience errors, but was aborted by user request or by YT. + Aborted, + }; + +public: + TOperationFailedError(EState state, TOperationId id, TYtError ytError, TVector<TFailedJobInfo> failedJobInfo); + + /// Get final state of operation. + EState GetState() const; + + /// Get operation id. + TOperationId GetOperationId() const; + + /// Return operation error. + const TYtError& GetError() const; + + /// Return info about failed jobs (if any). 
+ const TVector<TFailedJobInfo>& GetFailedJobInfo() const; + +private: + EState State_; + TOperationId OperationId_; + TYtError Error_; + TVector<TFailedJobInfo> FailedJobInfo_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/finish_or_die.h b/yt/cpp/mapreduce/interface/finish_or_die.h new file mode 100644 index 0000000000..9d7dcece02 --- /dev/null +++ b/yt/cpp/mapreduce/interface/finish_or_die.h @@ -0,0 +1,41 @@ +#pragma once + +#include <util/system/yassert.h> + +#include <exception> + +/// @cond Doxygen_Suppress +namespace NYT::NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +void FinishOrDie(T* pThis, const char* className) noexcept +{ + auto fail = [&] (const char* what) { + Y_FAIL( + "\n\n" + "Destructor of %s caught exception during Finish: %s.\n" + "Some data is probably has not been written.\n" + "In order to handle such exceptions consider explicitly call Finish() method.\n", + className, + what); + }; + + try { + pThis->Finish(); + } catch (const std::exception& ex) { + if (!std::uncaught_exceptions()) { + fail(ex.what()); + } + } catch (...) { + if (!std::uncaught_exceptions()) { + fail("<unknown exception>"); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail +/// @endcond diff --git a/yt/cpp/mapreduce/interface/fluent.h b/yt/cpp/mapreduce/interface/fluent.h new file mode 100644 index 0000000000..8ca6e86336 --- /dev/null +++ b/yt/cpp/mapreduce/interface/fluent.h @@ -0,0 +1,678 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/fluent.h +/// +/// Adapters for working with @ref NYson::IYsonConsumer in a structured way, with compile-time syntax checks. +/// +/// The following documentation is copied verbatim from `yt/core/ytree/fluent.h`. 
+/// +/// WHAT IS THIS +/// +/// Fluent adapters encapsulate invocation of IYsonConsumer methods in a +/// convenient structured manner. Key advantage of fluent-like code is that +/// attempt of building syntactically incorrect YSON structure will result +/// in a compile-time error. +/// +/// Each fluent object is associated with a context that defines possible YSON +/// tokens that may appear next. For example, TFluentMap is a fluent object +/// that corresponds to a location within YSON map right before a key-value +/// pair or the end of the map. +/// +/// More precisely, each object that may be obtained by a sequence of fluent +/// method calls has the full history of its enclosing YSON composite types in +/// its single template argument hereinafter referred to as TParent. This allows +/// us not to forget the original context after opening and closing the embedded +/// composite structure. +/// +/// It is possible to invoke a separate YSON building procedure by calling +/// one of convenience Do* methods. There are two possibilities here: it is +/// possible to delegate invocation context either as a fluent object (like +/// TFluentMap, TFluentList, TFluentAttributes or TFluentAny) or as a raw +/// IYsonConsumer*. The latter is discouraged since it is impossible to check +/// if a given side-built YSON structure fits current fluent context. +/// For example it is possible to call Do() method inside YSON map passing +/// consumer to a procedure that will treat context like it is in a list. +/// Passing typed fluent builder saves you from such a misbehaviour. +/// +/// TFluentXxx corresponds to an internal class of TXxx +/// without any history hidden in template argument. It allows you to +/// write procedures of form: +/// +/// void BuildSomeAttributesInYson(TFluentMap fluent) { ... } +/// +/// without thinking about the exact way how this procedure is nested in other +/// procedures. 
+/// +/// An important notation: we will refer to a function whose first argument +/// is TFluentXxx as TFuncXxx. +/// +/// +/// BRIEF LIST OF AVAILABLE METHODS +/// +/// Only the most popular methods are covered here. Refer to the code for the +/// rest of them. +/// +/// TAny: +/// * Value(T value) -> TParent, serialize `value` using underlying consumer. +/// T should be such that free function Serialize(const T&, NYson::IYsonConsumer*) is +/// defined; +/// * BeginMap() -> TFluentMap, open map; +/// * BeginList() -> TFluentList, open list; +/// * BeginAttributes() -> TFluentAttributes, open attributes; +/// +/// * Do(TFuncAny func) -> TAny, delegate invocation to a separate procedure. +/// * DoIf(bool condition, TFuncAny func) -> TAny, same as Do() but invoke +/// `func` only if `condition` is true; +/// * DoFor(TCollection collection, TFuncAny func) -> TAny, same as Do() +/// but iterate over `collection` and pass each of its elements as a second +/// argument to `func`. Instead of passing a collection it is possible +/// to pass two iterators as arguments; +/// +/// * DoMap(TFuncMap func) -> TAny, open a map, delegate invocation to a separate +/// procedure and close map; +/// * DoMapFor(TCollection collection, TFuncMap func) -> TAny, open a map, iterate +/// over `collection` and pass each of its elements as a second argument to `func` +/// and close map; +/// * DoList(TFuncList func) -> TAny, same as DoMap(); +/// * DoListFor(TCollection collection, TFuncList func) -> TAny, same as DoMapFor(). +/// +/// +/// TFluentMap: +/// * Item(TStringBuf key) -> TAny, open an element keyed with `key`; +/// * EndMap() -> TParent, close map; +/// * Do(TFuncMap func) -> TFluentMap, same as Do() for TAny; +/// * DoIf(bool condition, TFuncMap func) -> TFluentMap, same as DoIf() for TAny; +/// * DoFor(TCollection collection, TFuncMap func) -> TFluentMap, same as DoFor() for TAny. 
+/// +/// +/// TFluentList: +/// * Item() -> TAny, open a new list element; +/// * EndList() -> TParent, close list; +/// * Do(TFuncList func) -> TFluentList, same as Do() for TAny; +/// * DoIf(bool condition, TFuncList func) -> TFluentList, same as DoIf() for TAny; +/// * DoFor(TCollection collection, TFuncList func) -> TFluentList, same as DoFor() for TAny. +/// +/// +/// TFluentAttributes: +/// * Item(TStringBuf key) -> TAny, open an element keyed with `key`. +/// * EndAttributes() -> TParentWithoutAttributes, close attributes. Note that +/// this method leads to a context that is forced not to have attributes, +/// preventing us from putting attributes twice before an object. +/// * Do(TFuncAttributes func) -> TFluentAttributes, same as Do() for TAny; +/// * DoIf(bool condition, TFuncAttributes func) -> TFluentAttributes, same as DoIf() +/// for TAny; +/// * DoFor(TCollection collection, TFuncAttributes func) -> TFluentAttributes, same as DoFor() +/// for TAny. +/// + + +#include "common.h" +#include "serialize.h" + +#include <library/cpp/yson/node/serialize.h> +#include <library/cpp/yson/node/node_builder.h> + +#include <library/cpp/yson/consumer.h> +#include <library/cpp/yson/writer.h> + +#include <util/generic/noncopyable.h> +#include <util/generic/ptr.h> +#include <util/stream/str.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +struct TFluentYsonUnwrapper +{ + using TUnwrapped = T; + + static TUnwrapped Unwrap(T t) + { + return std::move(t); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +struct TFluentYsonVoid +{ }; + +template <> +struct TFluentYsonUnwrapper<TFluentYsonVoid> +{ + using TUnwrapped = void; + + static TUnwrapped Unwrap(TFluentYsonVoid) + { } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// This class is actually a namespace for specific fluent adapter classes. 
+class TFluentYsonBuilder + : private TNonCopyable +{ +private: + template <class T> + static void WriteValue(NYT::NYson::IYsonConsumer* consumer, const T& value) + { + Serialize(value, consumer); + } + +public: + class TFluentAny; + template <class TParent> class TAny; + template <class TParent> class TToAttributes; + template <class TParent> class TAttributes; + template <class TParent> class TListType; + template <class TParent> class TMapType; + + /// Base class for all fluent adapters. + template <class TParent> + class TFluentBase + { + public: + /// Implicit conversion to yson consumer + operator NYT::NYson::IYsonConsumer* () const + { + return Consumer; + } + + protected: + /// @cond Doxygen_Suppress + NYT::NYson::IYsonConsumer* Consumer; + TParent Parent; + + TFluentBase(NYT::NYson::IYsonConsumer* consumer, TParent parent) + : Consumer(consumer) + , Parent(std::move(parent)) + { } + + using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped; + + TUnwrappedParent GetUnwrappedParent() + { + return TFluentYsonUnwrapper<TParent>::Unwrap(std::move(Parent)); + } + /// @endcond Doxygen_Suppress + }; + + /// Base class for fluent adapters for fragment of list, map or attributes. + template <template <class TParent> class TThis, class TParent> + class TFluentFragmentBase + : public TFluentBase<TParent> + { + public: + using TDeepThis = TThis<TParent>; + using TShallowThis = TThis<TFluentYsonVoid>; + using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped; + + explicit TFluentFragmentBase(NYT::NYson::IYsonConsumer* consumer, TParent parent = TParent()) + : TFluentBase<TParent>(consumer, std::move(parent)) + { } + + /// Delegate invocation to a separate procedure. + template <class TFunc> + TDeepThis& Do(const TFunc& func) + { + func(TShallowThis(this->Consumer)); + return *static_cast<TDeepThis*>(this); + } + + /// Conditionally delegate invocation to a separate procedure. 
+ template <class TFunc> + TDeepThis& DoIf(bool condition, const TFunc& func) + { + if (condition) { + func(TShallowThis(this->Consumer)); + } + return *static_cast<TDeepThis*>(this); + } + + /// Calls `func(*this, element)` for each `element` in range `[begin, end)`. + template <class TFunc, class TIterator> + TDeepThis& DoFor(const TIterator& begin, const TIterator& end, const TFunc& func) + { + for (auto current = begin; current != end; ++current) { + func(TShallowThis(this->Consumer), current); + } + return *static_cast<TDeepThis*>(this); + } + + /// Calls `func(*this, element)` for each `element` in `collection`. + template <class TFunc, class TCollection> + TDeepThis& DoFor(const TCollection& collection, const TFunc& func) + { + for (const auto& item : collection) { + func(TShallowThis(this->Consumer), item); + } + return *static_cast<TDeepThis*>(this); + } + + }; + + /// Fluent adapter of a value without attributes. + template <class TParent> + class TAnyWithoutAttributes + : public TFluentBase<TParent> + { + public: + using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped; + + TAnyWithoutAttributes(NYT::NYson::IYsonConsumer* consumer, TParent parent) + : TFluentBase<TParent>(consumer, std::move(parent)) + { } + + /// Pass `value` to underlying consumer. + template <class T> + TUnwrappedParent Value(const T& value) + { + WriteValue(this->Consumer, value); + return this->GetUnwrappedParent(); + } + + /// Call `OnEntity()` of underlying consumer. + TUnwrappedParent Entity() + { + this->Consumer->OnEntity(); + return this->GetUnwrappedParent(); + } + + /// Serialize `collection` to underlying consumer as a list. 
+ template <class TCollection> + TUnwrappedParent List(const TCollection& collection) + { + this->Consumer->OnBeginList(); + for (const auto& item : collection) { + this->Consumer->OnListItem(); + WriteValue(this->Consumer, item); + } + this->Consumer->OnEndList(); + return this->GetUnwrappedParent(); + } + + /// Serialize maximum `maxSize` elements of `collection` to underlying consumer as a list. + template <class TCollection> + TUnwrappedParent ListLimited(const TCollection& collection, size_t maxSize) + { + this->Consumer->OnBeginAttributes(); + this->Consumer->OnKeyedItem("count"); + this->Consumer->OnInt64Scalar(collection.size()); + this->Consumer->OnEndAttributes(); + this->Consumer->OnBeginList(); + size_t printedSize = 0; + for (const auto& item : collection) { + if (printedSize >= maxSize) + break; + this->Consumer->OnListItem(); + WriteValue(this->Consumer, item); + ++printedSize; + } + this->Consumer->OnEndList(); + return this->GetUnwrappedParent(); + } + + /// Open a list. + TListType<TParent> BeginList() + { + this->Consumer->OnBeginList(); + return TListType<TParent>(this->Consumer, this->Parent); + } + + /// Open a list, delegate invocation to `func`, then close the list. + template <class TFunc> + TUnwrappedParent DoList(const TFunc& func) + { + this->Consumer->OnBeginList(); + func(TListType<TFluentYsonVoid>(this->Consumer)); + this->Consumer->OnEndList(); + return this->GetUnwrappedParent(); + } + + /// Open a list, call `func(*this, element)` for each `element` of range, then close the list. 
+ template <class TFunc, class TIterator> + TUnwrappedParent DoListFor(const TIterator& begin, const TIterator& end, const TFunc& func) + { + this->Consumer->OnBeginList(); + for (auto current = begin; current != end; ++current) { + func(TListType<TFluentYsonVoid>(this->Consumer), current); + } + this->Consumer->OnEndList(); + return this->GetUnwrappedParent(); + } + + /// Open a list, call `func(*this, element)` for each `element` of `collection`, then close the list. + template <class TFunc, class TCollection> + TUnwrappedParent DoListFor(const TCollection& collection, const TFunc& func) + { + this->Consumer->OnBeginList(); + for (const auto& item : collection) { + func(TListType<TFluentYsonVoid>(this->Consumer), item); + } + this->Consumer->OnEndList(); + return this->GetUnwrappedParent(); + } + + /// Open a map. + TMapType<TParent> BeginMap() + { + this->Consumer->OnBeginMap(); + return TMapType<TParent>(this->Consumer, this->Parent); + } + + /// Open a map, delegate invocation to `func`, then close the map. + template <class TFunc> + TUnwrappedParent DoMap(const TFunc& func) + { + this->Consumer->OnBeginMap(); + func(TMapType<TFluentYsonVoid>(this->Consumer)); + this->Consumer->OnEndMap(); + return this->GetUnwrappedParent(); + } + + /// Open a map, call `func(*this, element)` for each `element` of range, then close the map. + template <class TFunc, class TIterator> + TUnwrappedParent DoMapFor(const TIterator& begin, const TIterator& end, const TFunc& func) + { + this->Consumer->OnBeginMap(); + for (auto current = begin; current != end; ++current) { + func(TMapType<TFluentYsonVoid>(this->Consumer), current); + } + this->Consumer->OnEndMap(); + return this->GetUnwrappedParent(); + } + + /// Open a map, call `func(*this, element)` for each `element` of `collection`, then close the map. 
+ template <class TFunc, class TCollection> + TUnwrappedParent DoMapFor(const TCollection& collection, const TFunc& func) + { + this->Consumer->OnBeginMap(); + for (const auto& item : collection) { + func(TMapType<TFluentYsonVoid>(this->Consumer), item); + } + this->Consumer->OnEndMap(); + return this->GetUnwrappedParent(); + } + }; + + /// Fluent adapter of any value. + template <class TParent> + class TAny + : public TAnyWithoutAttributes<TParent> + { + public: + using TBase = TAnyWithoutAttributes<TParent>; + + explicit TAny(NYT::NYson::IYsonConsumer* consumer, TParent parent) + : TBase(consumer, std::move(parent)) + { } + + /// Open attributes. + TAttributes<TBase> BeginAttributes() + { + this->Consumer->OnBeginAttributes(); + return TAttributes<TBase>( + this->Consumer, + TBase(this->Consumer, this->Parent)); + } + }; + + /// Fluent adapter of attributes fragment (the inside part of attributes). + template <class TParent = TFluentYsonVoid> + class TAttributes + : public TFluentFragmentBase<TAttributes, TParent> + { + public: + using TThis = TAttributes<TParent>; + using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped; + + explicit TAttributes(NYT::NYson::IYsonConsumer* consumer, TParent parent = TParent()) + : TFluentFragmentBase<TFluentYsonBuilder::TAttributes, TParent>(consumer, std::move(parent)) + { } + + /// Pass attribute key to underlying consumer. + TAny<TThis> Item(const TStringBuf& key) + { + this->Consumer->OnKeyedItem(key); + return TAny<TThis>(this->Consumer, *this); + } + + /// Pass attribute key to underlying consumer. + template <size_t Size> + TAny<TThis> Item(const char (&key)[Size]) + { + return Item(TStringBuf(key, Size - 1)); + } + + //TODO: from TNode + + /// Close the attributes. + TUnwrappedParent EndAttributes() + { + this->Consumer->OnEndAttributes(); + return this->GetUnwrappedParent(); + } + }; + + /// Fluent adapter of list fragment (the inside part of a list). 
+ template <class TParent = TFluentYsonVoid> + class TListType + : public TFluentFragmentBase<TListType, TParent> + { + public: + using TThis = TListType<TParent>; + using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped; + + explicit TListType(NYT::NYson::IYsonConsumer* consumer, TParent parent = TParent()) + : TFluentFragmentBase<TFluentYsonBuilder::TListType, TParent>(consumer, std::move(parent)) + { } + + /// Call `OnListItem()` of underlying consumer. + TAny<TThis> Item() + { + this->Consumer->OnListItem(); + return TAny<TThis>(this->Consumer, *this); + } + + // TODO: from TNode + + /// Close the list. + TUnwrappedParent EndList() + { + this->Consumer->OnEndList(); + return this->GetUnwrappedParent(); + } + }; + + /// Fluent adapter of map fragment (the inside part of a map). + template <class TParent = TFluentYsonVoid> + class TMapType + : public TFluentFragmentBase<TMapType, TParent> + { + public: + using TThis = TMapType<TParent>; + using TUnwrappedParent = typename TFluentYsonUnwrapper<TParent>::TUnwrapped; + + explicit TMapType(NYT::NYson::IYsonConsumer* consumer, TParent parent = TParent()) + : TFluentFragmentBase<TFluentYsonBuilder::TMapType, TParent>(consumer, std::move(parent)) + { } + + /// Pass map key to underlying consumer. + template <size_t Size> + TAny<TThis> Item(const char (&key)[Size]) + { + return Item(TStringBuf(key, Size - 1)); + } + + /// Pass map key to underlying consumer. + TAny<TThis> Item(const TStringBuf& key) + { + this->Consumer->OnKeyedItem(key); + return TAny<TThis>(this->Consumer, *this); + } + + // TODO: from TNode + + /// Close the map. + TUnwrappedParent EndMap() + { + this->Consumer->OnEndMap(); + return this->GetUnwrappedParent(); + } + }; + +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Builder representing any value. +using TFluentAny = TFluentYsonBuilder::TAny<TFluentYsonVoid>; + +/// Builder representing the inside of a list (list fragment). 
+using TFluentList = TFluentYsonBuilder::TListType<TFluentYsonVoid>; + +/// Builder representing the inside of a map (map fragment). +using TFluentMap = TFluentYsonBuilder::TMapType<TFluentYsonVoid>; + +/// Builder representing the inside of attributes. +using TFluentAttributes = TFluentYsonBuilder::TAttributes<TFluentYsonVoid>; + +//////////////////////////////////////////////////////////////////////////////// + +/// Create a fluent adapter to invoke methods of `consumer`. +static inline TFluentAny BuildYsonFluently(NYT::NYson::IYsonConsumer* consumer) +{ + return TFluentAny(consumer, TFluentYsonVoid()); +} + +/// Create a fluent adapter to invoke methods of `consumer` describing the contents of a list. +static inline TFluentList BuildYsonListFluently(NYT::NYson::IYsonConsumer* consumer) +{ + return TFluentList(consumer); +} + +/// Create a fluent adapter to invoke methods of `consumer` describing the contents of a map. +static inline TFluentMap BuildYsonMapFluently(NYT::NYson::IYsonConsumer* consumer) +{ + return TFluentMap(consumer); +} + +//////////////////////////////////////////////////////////////////////////////// + +class TFluentYsonWriterState + : public TThrRefBase +{ +public: + using TValue = TString; + + explicit TFluentYsonWriterState(::NYson::EYsonFormat format) + : Writer(&Output, format) + { } + + TString GetValue() + { + return Output.Str(); + } + + NYT::NYson::IYsonConsumer* GetConsumer() + { + return &Writer; + } + +private: + TStringStream Output; + ::NYson::TYsonWriter Writer; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TFluentYsonBuilderState + : public TThrRefBase +{ +public: + using TValue = TNode; + + explicit TFluentYsonBuilderState() + : Builder(&Node) + { } + + TNode GetValue() + { + return std::move(Node); + } + + NYT::NYson::IYsonConsumer* GetConsumer() + { + return &Builder; + } + +private: + TNode Node; + TNodeBuilder Builder; +}; + 
+//////////////////////////////////////////////////////////////////////////////// + +template <class TState> +class TFluentYsonHolder +{ +public: + explicit TFluentYsonHolder(::TIntrusivePtr<TState> state) + : State(state) + { } + + ::TIntrusivePtr<TState> GetState() const + { + return State; + } + +private: + ::TIntrusivePtr<TState> State; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template <class TState> +struct TFluentYsonUnwrapper< TFluentYsonHolder<TState> > +{ + using TUnwrapped = typename TState::TValue; + + static TUnwrapped Unwrap(const TFluentYsonHolder<TState>& holder) + { + return std::move(holder.GetState()->GetValue()); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +template <class TState> +TFluentYsonBuilder::TAny<TFluentYsonHolder<TState>> +BuildYsonFluentlyWithState(::TIntrusivePtr<TState> state) +{ + return TFluentYsonBuilder::TAny<TFluentYsonHolder<TState>>( + state->GetConsumer(), + TFluentYsonHolder<TState>(state)); +} + +/// Create a fluent adapter returning a `TString` with corresponding YSON when construction is finished. +inline TFluentYsonBuilder::TAny<TFluentYsonHolder<TFluentYsonWriterState>> +BuildYsonStringFluently(::NYson::EYsonFormat format = ::NYson::EYsonFormat::Text) +{ + ::TIntrusivePtr<TFluentYsonWriterState> state(new TFluentYsonWriterState(format)); + return BuildYsonFluentlyWithState(state); +} + +/// Create a fluent adapter returning a @ref NYT::TNode when construction is finished. 
+inline TFluentYsonBuilder::TAny<TFluentYsonHolder<TFluentYsonBuilderState>> +BuildYsonNodeFluently() +{ + ::TIntrusivePtr<TFluentYsonBuilderState> state(new TFluentYsonBuilderState); + return BuildYsonFluentlyWithState(state); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/format.cpp b/yt/cpp/mapreduce/interface/format.cpp new file mode 100644 index 0000000000..f8318310a4 --- /dev/null +++ b/yt/cpp/mapreduce/interface/format.cpp @@ -0,0 +1,135 @@ +#include "format.h" +#include "protobuf_format.h" + +#include "errors.h" + +#include <google/protobuf/descriptor.h> +#include <google/protobuf/messagext.h> + +namespace NYT { + +TTableSchema CreateTableSchema( + const ::google::protobuf::Descriptor& messageDescriptor, + bool keepFieldsWithoutExtension) +{ + return NDetail::CreateTableSchemaImpl(messageDescriptor, keepFieldsWithoutExtension); +} + +//////////////////////////////////////////////////////////////////////////////// + +TFormat::TFormat(const TNode& config) + : Config(config) +{ } + + +TFormat TFormat::Protobuf( + const TVector<const ::google::protobuf::Descriptor*>& descriptors, + bool withDescriptors) +{ + if (withDescriptors) { + return TFormat(NDetail::MakeProtoFormatConfigWithDescriptors(descriptors)); + } else { + return TFormat(NDetail::MakeProtoFormatConfigWithTables(descriptors)); + } +} + +TFormat TFormat::YsonText() +{ + TNode config("yson"); + config.Attributes()("format", "text"); + return TFormat(config); +} + +TFormat TFormat::YsonBinary() +{ + TNode config("yson"); + config.Attributes()("format", "binary"); + return TFormat(config); +} + +TFormat TFormat::YaMRLenval() +{ + TNode config("yamr"); + config.Attributes()("lenval", true)("has_subkey", true); + return TFormat(config); +} + +TFormat TFormat::Json() +{ + return TFormat(TNode("json")); +} + +bool TFormat::IsTextYson() const +{ + if (!Config.IsString() || Config.AsString() != "yson") { + 
return false; + } + if (!Config.HasAttributes()) { + return false; + } + const auto& attributes = Config.GetAttributes(); + if (!attributes.HasKey("format") || attributes["format"] != TNode("text")) { + return false; + } + return true; +} + +bool TFormat::IsProtobuf() const +{ + return Config.IsString() && Config.AsString() == "protobuf"; +} + +bool TFormat::IsYamredDsv() const +{ + return Config.IsString() && Config.AsString() == "yamred_dsv"; +} + +static TString FormatName(const TFormat& format) +{ + if (!format.Config.IsString()) { + Y_VERIFY(format.Config.IsUndefined()); + return "<undefined>"; + } + return format.Config.AsString(); +} + +TYamredDsvAttributes TFormat::GetYamredDsvAttributes() const +{ + if (!IsYamredDsv()) { + ythrow TApiUsageError() << "Cannot get yamred_dsv attributes for " << FormatName(*this) << " format"; + } + TYamredDsvAttributes attributes; + + const auto& nodeAttributes = Config.GetAttributes(); + { + const auto& keyColumns = nodeAttributes["key_column_names"]; + if (!keyColumns.IsList()) { + ythrow yexception() << "Ill-formed format: key_column_names is of non-list type: " << keyColumns.GetType(); + } + for (auto& column : keyColumns.AsList()) { + if (!column.IsString()) { + ythrow yexception() << "Ill-formed format: key_column_names: " << column.GetType(); + } + attributes.KeyColumnNames.push_back(column.AsString()); + } + } + + if (nodeAttributes.HasKey("subkey_column_names")) { + const auto& subkeyColumns = nodeAttributes["subkey_column_names"]; + if (!subkeyColumns.IsList()) { + ythrow yexception() << "Ill-formed format: subkey_column_names is not a list: " << subkeyColumns.GetType(); + } + for (const auto& column : subkeyColumns.AsList()) { + if (!column.IsString()) { + ythrow yexception() << "Ill-formed format: non-string inside subkey_key_column_names: " << column.GetType(); + } + attributes.SubkeyColumnNames.push_back(column.AsString()); + } + } + + return attributes; +} + 
+//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/format.h b/yt/cpp/mapreduce/interface/format.h new file mode 100644 index 0000000000..e297576464 --- /dev/null +++ b/yt/cpp/mapreduce/interface/format.h @@ -0,0 +1,122 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/format.h +/// +/// Header containing class to work with raw [YT formats](https://yt.yandex-team.ru/docs/description/storage/formats.html). + +#include "node.h" + +#include <google/protobuf/descriptor.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// @deprecated +struct TYamredDsvAttributes +{ + /// Names of key columns. + TVector<TString> KeyColumnNames; + + /// Names of subkey columns. + TVector<TString> SubkeyColumnNames; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Class representing YT data format. +/// +/// Normally the user does not need to use it. +/// However, the class is handy for "raw" operations and table reading and writing, +/// e.g. @ref NYT::IOperationClient::RawMap and other raw operations, +/// @ref NYT::IIOClient::CreateRawReader and @ref NYT::IIOClient::CreateRawWriter. +/// Anyway, the static factory methods should be preferred to the constructor. +/// +/// @see [YT doc](https://yt.yandex-team.ru/docs/description/storage/formats.html). +struct TFormat +{ +public: + /// Format representation understandable by YT. + TNode Config; + +public: + /// @brief Construct format from given YT format representation. + /// + /// @note Prefer using static factory methods (e.g. @ref NYT::TFormat::YsonBinary, @ref NYT::TFormat::YsonText, @ref NYT::TFormat::Protobuf). + explicit TFormat(const TNode& config = TNode()); + + /// @brief Create text YSON format. 
+ /// + /// @see [the doc](https://yt.yandex-team.ru/docs/description/storage/formats.html#YSON) + static TFormat YsonText(); + + /// @brief Create binary YSON format. + /// + /// @see [the doc](https://yt.yandex-team.ru/docs/description/storage/formats.html#YSON) + static TFormat YsonBinary(); + + /// @brief Create YaMR format. + /// + /// @deprecated + static TFormat YaMRLenval(); + + /// @brief Create protobuf format from protobuf message descriptors. + /// + /// @see [the doc](https://yt.yandex-team.ru/docs/api/c++/protobuf.html). + static TFormat Protobuf( + const TVector<const ::google::protobuf::Descriptor*>& descriptors, + bool withDescriptors = false); + + /// @brief Create JSON format. + /// + /// @see [the doc](https://yt.yandex-team.ru/docs/description/storage/formats.html#JSON) + static TFormat Json(); + + /// @brief Create protobuf format for the message specified in template parameter. + /// + /// `T` must be inherited from `Message`. + /// + /// @see [the doc](https://yt.yandex-team.ru/docs/api/c++/protobuf.html). + template<typename T> + static inline TFormat Protobuf(bool withDescriptors = false); + + /// @brief Is the format text YSON? + /// + /// @see [the doc](https://yt.yandex-team.ru/docs/description/storage/formats.html#YSON) + bool IsTextYson() const; + + /// @brief Is the format protobuf? + /// + /// @see [the doc](https://yt.yandex-team.ru/docs/api/c++/protobuf.html) + bool IsProtobuf() const; + + /// @brief Is the format YaMR? + /// + /// @deprecated + bool IsYamredDsv() const; + + /// @brief For YAMR format returns its attributes in structured way. + /// + /// @deprecated + TYamredDsvAttributes GetYamredDsvAttributes() const; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template<typename T> +TFormat TFormat::Protobuf(bool withDescriptors) { + return TFormat::Protobuf({T::descriptor()}, withDescriptors); +} + +/// @brief Create table schema from protobuf message descriptor. 
+/// +/// @param messageDescriptor Message descriptor +/// @param keepFieldsWithoutExtension Add to schema fields without "column_name" or "key_column_name" extensions. +TTableSchema CreateTableSchema( + const ::google::protobuf::Descriptor& messageDescriptor, + bool keepFieldsWithoutExtension); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/format_ut.cpp b/yt/cpp/mapreduce/interface/format_ut.cpp new file mode 100644 index 0000000000..069c29087d --- /dev/null +++ b/yt/cpp/mapreduce/interface/format_ut.cpp @@ -0,0 +1,235 @@ +#include "common.h" +#include "errors.h" +#include "format.h" +#include "common_ut.h" + +#include <yt/cpp/mapreduce/interface/proto3_ut.pb.h> +#include <yt/cpp/mapreduce/interface/protobuf_table_schema_ut.pb.h> + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NYT; + +static TNode GetColumns(const TFormat& format, int tableIndex = 0) +{ + return format.Config.GetAttributes()["tables"][tableIndex]["columns"]; +} + +Y_UNIT_TEST_SUITE(ProtobufFormat) +{ + Y_UNIT_TEST(TIntegral) + { + const auto format = TFormat::Protobuf<NUnitTesting::TIntegral>(); + auto columns = GetColumns(format); + + struct TColumn + { + TString Name; + TString ProtoType; + int FieldNumber; + }; + + auto expected = TVector<TColumn>{ + {"DoubleField", "double", 1}, + {"FloatField", "float", 2}, + {"Int32Field", "int32", 3}, + {"Int64Field", "int64", 4}, + {"Uint32Field", "uint32", 5}, + {"Uint64Field", "uint64", 6}, + {"Sint32Field", "sint32", 7}, + {"Sint64Field", "sint64", 8}, + {"Fixed32Field", "fixed32", 9}, + {"Fixed64Field", "fixed64", 10}, + {"Sfixed32Field", "sfixed32", 11}, + {"Sfixed64Field", "sfixed64", 12}, + {"BoolField", "bool", 13}, + {"EnumField", "enum_string", 14}, + }; + + UNIT_ASSERT_VALUES_EQUAL(columns.Size(), expected.size()); + for (int i = 0; i < static_cast<int>(columns.Size()); ++i) { + UNIT_ASSERT_VALUES_EQUAL(columns[i]["name"], 
expected[i].Name); + UNIT_ASSERT_VALUES_EQUAL(columns[i]["proto_type"], expected[i].ProtoType); + UNIT_ASSERT_VALUES_EQUAL(columns[i]["field_number"], expected[i].FieldNumber); + } + } + + Y_UNIT_TEST(TRowFieldSerializationOption) + { + const auto format = TFormat::Protobuf<NUnitTesting::TRowFieldSerializationOption>(); + auto columns = GetColumns(format); + + UNIT_ASSERT_VALUES_EQUAL(columns[0]["name"], "UrlRow_1"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["field_number"], 1); + const auto& fields = columns[0]["fields"]; + UNIT_ASSERT_VALUES_EQUAL(fields[0]["name"], "Host"); + UNIT_ASSERT_VALUES_EQUAL(fields[0]["proto_type"], "string"); + UNIT_ASSERT_VALUES_EQUAL(fields[0]["field_number"], 1); + + UNIT_ASSERT_VALUES_EQUAL(fields[1]["name"], "Path"); + UNIT_ASSERT_VALUES_EQUAL(fields[1]["proto_type"], "string"); + UNIT_ASSERT_VALUES_EQUAL(fields[1]["field_number"], 2); + + UNIT_ASSERT_VALUES_EQUAL(fields[2]["name"], "HttpCode"); + UNIT_ASSERT_VALUES_EQUAL(fields[2]["proto_type"], "sint32"); + UNIT_ASSERT_VALUES_EQUAL(fields[2]["field_number"], 3); + + UNIT_ASSERT_VALUES_EQUAL(columns[1]["name"], "UrlRow_2"); + UNIT_ASSERT_VALUES_EQUAL(columns[1]["proto_type"], "message"); + UNIT_ASSERT_VALUES_EQUAL(columns[1]["field_number"], 2); + } + + Y_UNIT_TEST(Packed) + { + const auto format = TFormat::Protobuf<NUnitTesting::TPacked>(); + auto column = GetColumns(format)[0]; + + UNIT_ASSERT_VALUES_EQUAL(column["name"], "PackedListInt64"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "int64"); + UNIT_ASSERT_VALUES_EQUAL(column["field_number"], 1); + UNIT_ASSERT_VALUES_EQUAL(column["packed"], true); + UNIT_ASSERT_VALUES_EQUAL(column["repeated"], true); + } + + Y_UNIT_TEST(Cyclic) + { + UNIT_ASSERT_EXCEPTION(TFormat::Protobuf<NUnitTesting::TCyclic>(), TApiUsageError); + UNIT_ASSERT_EXCEPTION(TFormat::Protobuf<NUnitTesting::TCyclic::TA>(), TApiUsageError); + 
// Checks the columns generated for NUnitTesting::TWithMap. There must be exactly five
// columns; each one is a "structured_message" with two fields, the first of proto type
// "int64" and the second of a proto type that depends on the column.
Y_UNIT_TEST(Map)
{
    const auto format = TFormat::Protobuf<NUnitTesting::TWithMap>();
    auto columns = GetColumns(format);

    UNIT_ASSERT_VALUES_EQUAL(columns.Size(), 5);

    // Shared assertions for one map column; only the column name and the proto type
    // of the second field differ between the five columns.
    auto checkMapColumn = [] (const TNode& column, TStringBuf name, TStringBuf secondFieldProtoType) {
        UNIT_ASSERT_VALUES_EQUAL(column["name"], name);
        UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message");
        UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 2);
        UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["proto_type"], "int64");
        UNIT_ASSERT_VALUES_EQUAL(column["fields"][1]["proto_type"], secondFieldProtoType);
    };

    checkMapColumn(columns[0], "MapDefault", "message");
    checkMapColumn(columns[1], "MapListOfStructsLegacy", "message");
    checkMapColumn(columns[2], "MapListOfStructs", "structured_message");
    checkMapColumn(columns[3], "MapOptionalDict", "structured_message");
    checkMapColumn(columns[4], "MapDict", "structured_message");
}
"y1"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][4]["name"], "z1"); + }; + + check(columns[0], "DefaultSeparateFields", "variant_field_name"); + check(columns[1], "NoDefault", "Oneof2"); + + { + const auto& column = columns[2]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "SerializationProtobuf"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 3); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["name"], "x1"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][1]["name"], "y1"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][2]["name"], "z1"); + } + { + const auto& column = columns[3]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "TopLevelOneof"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "oneof"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 1); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["name"], "MemberOfTopLevelOneof"); + } + } +} + +Y_UNIT_TEST_SUITE(Proto3) +{ + Y_UNIT_TEST(TWithOptional) + { + const auto format = TFormat::Protobuf<NTestingProto3::TWithOptional>(); + auto columns = GetColumns(format); + + UNIT_ASSERT_VALUES_EQUAL(columns[0]["name"], "x"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["proto_type"], "int64"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["field_number"], 1); + } + + Y_UNIT_TEST(TWithOptionalMessage) + { + const auto format = TFormat::Protobuf<NTestingProto3::TWithOptionalMessage>(); + auto columns = GetColumns(format); + + UNIT_ASSERT_VALUES_EQUAL(columns[0]["name"], "x"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["field_number"], 1); + + UNIT_ASSERT_VALUES_EQUAL(columns[0]["fields"].Size(), 1); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["fields"][0]["name"], "x"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["fields"][0]["proto_type"], "int64"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["fields"][0]["field_number"], 1); + } +} diff --git a/yt/cpp/mapreduce/interface/fwd.h b/yt/cpp/mapreduce/interface/fwd.h 
new file mode 100644 index 0000000000..0434c03d8b --- /dev/null +++ b/yt/cpp/mapreduce/interface/fwd.h @@ -0,0 +1,397 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/fwd.h +/// +/// Header containing mostly forward declarations of types. + + +#include <util/generic/fwd.h> +#include <util/system/types.h> + +#include <variant> + +/// @cond Doxygen_Suppress +namespace google::protobuf { + class Message; +} + +namespace NYT { + + //////////////////////////////////////////////////////////////////////////////// + // batch_request.h + //////////////////////////////////////////////////////////////////////////////// + + class IBatchRequest; + using TBatchRequestPtr = ::TIntrusivePtr<IBatchRequest>; + + //////////////////////////////////////////////////////////////////////////////// + // client.h + //////////////////////////////////////////////////////////////////////////////// + + enum ELockMode : int; + + struct TStartTransactionOptions; + + struct TLockOptions; + + template <class TDerived> + struct TTabletOptions; + + struct TMountTableOptions; + + struct TUnmountTableOptions; + + struct TRemountTableOptions; + + struct TReshardTableOptions; + + struct TAlterTableOptions; + + struct TLookupRowsOptions; + + struct TSelectRowsOptions; + + struct TCreateClientOptions; + + struct TAlterTableReplicaOptions; + + struct TGetFileFromCacheOptions; + + struct TPutFileToCacheOptions; + + struct TCheckPermissionResult; + struct TCheckPermissionResponse; + struct TCheckPermissionOptions; + + struct TTabletInfo; + + class ILock; + using ILockPtr = ::TIntrusivePtr<ILock>; + + class ITransaction; + using ITransactionPtr = ::TIntrusivePtr<ITransaction>; + + class ITransactionPinger; + using ITransactionPingerPtr = ::TIntrusivePtr<ITransactionPinger>; + + struct IOperation; + using IOperationPtr = ::TIntrusivePtr<IOperation>; + + class IClientBase; + + class IClient; + + using IClientPtr = ::TIntrusivePtr<IClient>; + using IClientBasePtr = ::TIntrusivePtr<IClientBase>; + + 
//////////////////////////////////////////////////////////////////////////////// + // config.h + //////////////////////////////////////////////////////////////////////////////// + + struct TConfig; + using TConfigPtr = ::TIntrusivePtr<TConfig>; + + //////////////////////////////////////////////////////////////////////////////// + // cypress.h + //////////////////////////////////////////////////////////////////////////////// + + enum ENodeType : int; + + struct TCreateOptions; + + struct TRemoveOptions; + + struct TGetOptions; + + struct TSetOptions; + + struct TMultisetAttributesOptions; + + struct TListOptions; + + struct TCopyOptions; + + struct TMoveOptions; + + struct TLinkOptions; + + struct TConcatenateOptions; + + struct TInsertRowsOptions; + + struct TDeleteRowsOptions; + + struct TTrimRowsOptions; + + class ICypressClient; + + //////////////////////////////////////////////////////////////////////////////// + // errors.h + //////////////////////////////////////////////////////////////////////////////// + + class TApiUsageError; + + class TYtError; + + class TErrorResponse; + + struct TFailedJobInfo; + + class TOperationFailedError; + + //////////////////////////////////////////////////////////////////////////////// + // node.h + //////////////////////////////////////////////////////////////////////////////// + + class TNode; + + //////////////////////////////////////////////////////////////////////////////// + // common.h + //////////////////////////////////////////////////////////////////////////////// + + using TTransactionId = TGUID; + using TNodeId = TGUID; + using TLockId = TGUID; + using TOperationId = TGUID; + using TTabletCellId = TGUID; + using TReplicaId = TGUID; + using TJobId = TGUID; + + using TYPath = TString; + using TLocalFilePath = TString; + + template <class T, class TDerived = void> + struct TOneOrMany; + + // key column values + using TKey = TOneOrMany<TNode>; + + class TSortColumn; + + // column names + using TColumnNames = 
TOneOrMany<TString>; + + // key column descriptors. + class TSortColumns; + + enum EValueType : int; + + enum ESortOrder : int; + + enum EOptimizeForAttr : i8; + + enum EErasureCodecAttr : i8; + + enum ESchemaModificationAttr : i8; + + enum class EMasterReadKind : int; + + class TColumnSchema; + + class TTableSchema; + + enum class ERelation; + + struct TKeyBound; + + struct TReadLimit; + + struct TReadRange; + + struct TRichYPath; + + struct TAttributeFilter; + + //////////////////////////////////////////////////////////////////////////////// + // io.h + //////////////////////////////////////////////////////////////////////////////// + + enum class EFormatType : int; + + struct TFormat; + + class IFileReader; + + using IFileReaderPtr = ::TIntrusivePtr<IFileReader>; + + class IFileWriter; + + using IFileWriterPtr = ::TIntrusivePtr<IFileWriter>; + + class IBlobTableReader; + using IBlobTableReaderPtr = ::TIntrusivePtr<IBlobTableReader>; + + class TRawTableReader; + + using TRawTableReaderPtr = ::TIntrusivePtr<TRawTableReader>; + + class TRawTableWriter; + + using TRawTableWriterPtr = ::TIntrusivePtr<TRawTableWriter>; + + template <class T, class = void> + class TTableReader; + + template <class T, class = void> + class TTableRangesReader; + + template <typename T> + using TTableRangesReaderPtr = ::TIntrusivePtr<TTableRangesReader<T>>; + + template <class T> + using TTableReaderPtr = ::TIntrusivePtr<TTableReader<T>>; + + template <class T, class = void> + class TTableWriter; + + template <class T> + using TTableWriterPtr = ::TIntrusivePtr<TTableWriter<T>>; + + struct TYaMRRow; + + using ::google::protobuf::Message; + + class ISkiffRowParser; + + using ISkiffRowParserPtr = ::TIntrusivePtr<ISkiffRowParser>; + + class ISkiffRowSkipper; + + using ISkiffRowSkipperPtr = ::TIntrusivePtr<ISkiffRowSkipper>; + + namespace NDetail { + + class TYdlGenericRowType; + + } // namespace NDetail + + template<class... TYdlRowTypes> + class TYdlOneOf; + + template<class... 
TProtoRowTypes> + class TProtoOneOf; + + template<class... TSkiffRowTypes> + class TSkiffRowOneOf; + + using TYaMRReader = TTableReader<TYaMRRow>; + using TYaMRWriter = TTableWriter<TYaMRRow>; + using TNodeReader = TTableReader<TNode>; + using TNodeWriter = TTableWriter<TNode>; + using TMessageReader = TTableReader<Message>; + using TMessageWriter = TTableWriter<Message>; + using TYdlTableWriter = TTableWriter<NDetail::TYdlGenericRowType>; + + template <class TDerived> + struct TIOOptions; + + struct TFileReaderOptions; + + struct TFileWriterOptions; + + struct TTableReaderOptions; + + class TSkiffRowHints; + + struct TTableWriterOptions; + + //////////////////////////////////////////////////////////////////////////////// + // job_statistics.h + //////////////////////////////////////////////////////////////////////////////// + + class TJobStatistics; + + template <typename T> + class TJobStatisticsEntry; + + //////////////////////////////////////////////////////////////////////////////// + // operation.h + //////////////////////////////////////////////////////////////////////////////// + + class TFormatHints; + + struct TUserJobSpec; + + struct TMapOperationSpec; + + struct TRawMapOperationSpec; + + struct TReduceOperationSpec; + + struct TMapReduceOperationSpec; + + struct TJoinReduceOperationSpec; + + struct TSortOperationSpec; + + class IIOperationPreparationContext; + + class IJob; + using IJobPtr = ::TIntrusivePtr<IJob>; + + class IRawJob; + using IRawJobPtr = ::TIntrusivePtr<IRawJob>; + + enum EMergeMode : int; + + struct TMergeOperationSpec; + + struct TEraseOperationSpec; + + template <class TR, class TW> + class IMapper; + + template <class TR, class TW> + class IReducer; + + template <class TR, class TW> + class IAggregatorReducer; + + struct TSuspendOperationOptions; + + struct TResumeOperationOptions; + + enum class EOperationBriefState : int; + + struct TOperationAttributes; + + struct TOperationOptions; + + enum class EOperationAttribute : int; + + 
/// Options for the @ref NYT::Initialize() and @ref NYT::JoblessInitialize() functions.
struct TInitializeOptions
{
    using TSelf = TInitializeOptions;

    ///
    /// @brief Override waiting functions for YT Wrapper.
    ///
    /// This option allows overriding the functions that the library uses to wait for something.
    /// Defaults to nullptr.
    FLUENT_FIELD_DEFAULT(::TIntrusivePtr<IWaitProxy>, WaitProxy, nullptr);

    ///
    /// @brief Enable/disable cleanup when program execution terminates abnormally.
    ///
    /// When set to true, the library will abort all active transactions and running operations
    /// when the program terminates on an error or a signal. Defaults to false.
    FLUENT_FIELD_DEFAULT(bool, CleanupOnTermination, false);

    ///
    /// @brief Set a callback to be called before exit() in job mode.
    ///
    /// The provided function will be called just before exit() when the program is started in job mode.
    /// This might be useful for shutting down libraries that are used inside operations.
    ///
    /// NOTE: Keep in mind that the execution environment inside a job differs from the client
    /// execution environment, so JobOnExitFunction should not depend on argc/argv,
    /// environment variables, etc.
    FLUENT_FIELD_OPTION(std::function<void()>, JobOnExitFunction);
};
/// Maps a row type to the low-level reader/writer implementation interfaces that handle it.
///
/// Each specialization exposes `TRowType` and `IReaderImpl`; all except the Skiff ones
/// also expose `IWriterImpl`. The second template parameter exists only to allow
/// `std::enable_if_t`-based partial specializations.
template <class T, class = void>
struct TRowTraits;

/// TNode rows use the node reader/writer implementations.
template <>
struct TRowTraits<TNode>
{
    using TRowType = TNode;
    using IReaderImpl = INodeReaderImpl;
    using IWriterImpl = INodeWriterImpl;
};

/// TYaMRRow rows use the YaMR reader/writer implementations.
template <>
struct TRowTraits<TYaMRRow>
{
    using TRowType = TYaMRRow;
    using IReaderImpl = IYaMRReaderImpl;
    using IWriterImpl = IYaMRWriterImpl;
};

/// The generic protobuf `Message` uses the protobuf reader/writer implementations.
template <>
struct TRowTraits<Message>
{
    using TRowType = Message;
    using IReaderImpl = IProtoReaderImpl;
    using IWriterImpl = IProtoWriterImpl;
};

/// Any type for which `TIsBaseOf<Message, T>` holds uses the protobuf implementations as well.
template <class T>
struct TRowTraits<T, std::enable_if_t<TIsBaseOf<Message, T>::Value>>
{
    using TRowType = T;
    using IReaderImpl = IProtoReaderImpl;
    using IWriterImpl = IProtoWriterImpl;
};

/// Skiff rows: only a reader implementation is declared here (no `IWriterImpl`).
template <class T>
struct TRowTraits<T, std::enable_if_t<TIsSkiffRow<T>::value>>
{
    using TRowType = T;
    using IReaderImpl = ISkiffRowReaderImpl;
};

/// One-of several Skiff row types: reader implementation only (no `IWriterImpl`).
template <class... TSkiffRowTypes>
struct TRowTraits<TSkiffRowOneOf<TSkiffRowTypes...>>
{
    using TRowType = TSkiffRowOneOf<TSkiffRowTypes...>;
    using IReaderImpl = ISkiffRowReaderImpl;
};

/// One-of several protobuf row types uses the protobuf reader/writer implementations.
template <class... TProtoRowTypes>
struct TRowTraits<TProtoOneOf<TProtoRowTypes...>>
{
    using TRowType = TProtoOneOf<TProtoRowTypes...>;
    using IReaderImpl = IProtoReaderImpl;
    using IWriterImpl = IProtoWriterImpl;
};
+ virtual TMaybe<size_t> GetReadByteCount() const; + virtual i64 GetTabletIndex() const; + virtual bool IsEndOfStream() const; + virtual bool IsRawReaderExhausted() const; +}; + +struct INodeReaderImpl + : public IReaderImplBase +{ + virtual const TNode& GetRow() const = 0; + virtual void MoveRow(TNode* row) = 0; +}; + +struct IYaMRReaderImpl + : public IReaderImplBase +{ + virtual const TYaMRRow& GetRow() const = 0; + virtual void MoveRow(TYaMRRow* row) + { + *row = GetRow(); + } +}; + +struct IProtoReaderImpl + : public IReaderImplBase +{ + virtual void ReadRow(Message* row) = 0; +}; + +struct ISkiffRowReaderImpl + : public IReaderImplBase +{ + virtual void ReadRow(const ISkiffRowParserPtr& parser) = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +// We don't include <yt/cpp/mapreduce/interface/logging/yt_log.h> in this file +// to avoid macro name clashes (specifically YT_LOG_DEBUG) +void LogTableReaderStatistics(ui64 rowCount, TMaybe<size_t> byteCount); + +template <class T> +class TTableReaderBase + : public TThrRefBase +{ +public: + using TRowType = typename TRowTraits<T>::TRowType; + using IReaderImpl = typename TRowTraits<T>::IReaderImpl; + + explicit TTableReaderBase(::TIntrusivePtr<IReaderImpl> reader) + : Reader_(reader) + { } + + ~TTableReaderBase() override + { + NDetail::LogTableReaderStatistics(ReadRowCount_, Reader_->GetReadByteCount()); + } + + bool IsValid() const + { + return Reader_->IsValid(); + } + + void Next() + { + Reader_->Next(); + ++ReadRowCount_; + RowState_ = ERowState::None; + } + + bool IsEndOfStream() + { + return Reader_->IsEndOfStream(); + } + + bool IsRawReaderExhausted() + { + return Reader_->IsRawReaderExhausted(); + } + + ui32 GetTableIndex() const + { + return Reader_->GetTableIndex(); + } + + ui32 GetRangeIndex() const + { + return Reader_->GetRangeIndex(); + } + + ui64 
GetRowIndex() const + { + return Reader_->GetRowIndex(); + } + + i64 GetTabletIndex() const + { + return Reader_->GetTabletIndex(); + } + +protected: + template <typename TCacher, typename TCacheGetter> + const auto& DoGetRowCached(TCacher cacher, TCacheGetter cacheGetter) const + { + switch (RowState_) { + case ERowState::None: + cacher(); + RowState_ = ERowState::Cached; + break; + case ERowState::Cached: + break; + case ERowState::MovedOut: + ythrow yexception() << "Row is already moved"; + } + return *cacheGetter(); + } + + template <typename U, typename TMover, typename TCacheMover> + void DoMoveRowCached(U* result, TMover mover, TCacheMover cacheMover) + { + Y_VERIFY(result); + switch (RowState_) { + case ERowState::None: + mover(result); + break; + case ERowState::Cached: + cacheMover(result); + break; + case ERowState::MovedOut: + ythrow yexception() << "Row is already moved"; + } + RowState_ = ERowState::MovedOut; + } + +private: + enum class ERowState + { + None, + Cached, + MovedOut, + }; + +protected: + ::TIntrusivePtr<IReaderImpl> Reader_; + +private: + ui64 ReadRowCount_ = 0; + mutable ERowState RowState_ = ERowState::None; +}; + +template <class T> +class TSimpleTableReader + : public TTableReaderBase<T> +{ +public: + using TBase = TTableReaderBase<T>; + using typename TBase::TRowType; + + using TBase::TBase; + + const TRowType& GetRow() const + { + // Caching is implemented in underlying reader. + return TBase::DoGetRowCached( + /* cacher */ [&] {}, + /* cacheGetter */ [&] { + return &Reader_->GetRow(); + }); + } + + void MoveRow(TRowType* result) + { + // Caching is implemented in underlying reader. 
+ TBase::DoMoveRowCached( + result, + /* mover */ [&] (TRowType* result) { + Reader_->MoveRow(result); + }, + /* cacheMover */ [&] (TRowType* result) { + Reader_->MoveRow(result); + }); + } + + TRowType MoveRow() + { + TRowType result; + MoveRow(&result); + return result; + } + +private: + using TBase::Reader_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail + +template <> +class TTableReader<TNode> + : public NDetail::TSimpleTableReader<TNode> +{ + using TSimpleTableReader<TNode>::TSimpleTableReader; +}; + +template <> +class TTableReader<TYaMRRow> + : public NDetail::TSimpleTableReader<TYaMRRow> +{ + using TSimpleTableReader<TYaMRRow>::TSimpleTableReader; +}; + +template <> +class TTableReader<Message> + : public NDetail::TTableReaderBase<Message> +{ +public: + using TBase = NDetail::TTableReaderBase<Message>; + + using TBase::TBase; + + template <class U> + const U& GetRow() const + { + static_assert(TIsBaseOf<Message, U>::Value); + + return TBase::DoGetRowCached( + /* cacher */ [&] { + CachedRow_.Reset(new U); + Reader_->ReadRow(CachedRow_.Get()); + }, + /* cacheGetter */ [&] { + auto result = dynamic_cast<const U*>(CachedRow_.Get()); + Y_VERIFY(result); + return result; + }); + } + + template <class U> + void MoveRow(U* result) + { + static_assert(TIsBaseOf<Message, U>::Value); + + TBase::DoMoveRowCached( + result, + /* mover */ [&] (U* result) { + Reader_->ReadRow(result); + }, + /* cacheMover */ [&] (U* result) { + auto cast = dynamic_cast<U*>(CachedRow_.Get()); + Y_VERIFY(cast); + result->Swap(cast); + }); + } + + template <class U> + U MoveRow() + { + static_assert(TIsBaseOf<Message, U>::Value); + + U result; + MoveRow(&result); + return result; + } + + ::TIntrusivePtr<IProtoReaderImpl> GetReaderImpl() const + { + return Reader_; + } + +private: + using TBase::Reader_; + mutable THolder<Message> CachedRow_; +}; + +template<class... 
TProtoRowTypes> +class TTableReader<TProtoOneOf<TProtoRowTypes...>> + : public NDetail::TTableReaderBase<TProtoOneOf<TProtoRowTypes...>> +{ +public: + using TBase = NDetail::TTableReaderBase<TProtoOneOf<TProtoRowTypes...>>; + + using TBase::TBase; + + template <class U> + const U& GetRow() const + { + AssertIsOneOf<U>(); + return TBase::DoGetRowCached( + /* cacher */ [&] { + Reader_->ReadRow(&std::get<U>(CachedRows_)); + CachedIndex_ = NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value; + }, + /* cacheGetter */ [&] { + return &std::get<U>(CachedRows_); + }); + } + + template <class U> + void MoveRow(U* result) + { + AssertIsOneOf<U>(); + return TBase::DoMoveRowCached( + result, + /* mover */ [&] (U* result) { + Reader_->ReadRow(result); + }, + /* cacheMover */ [&] (U* result) { + Y_VERIFY((NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value) == CachedIndex_); + *result = std::move(std::get<U>(CachedRows_)); + }); + } + + template <class U> + U MoveRow() + { + U result; + MoveRow(&result); + return result; + } + + ::TIntrusivePtr<IProtoReaderImpl> GetReaderImpl() const + { + return Reader_; + } + +private: + using TBase::Reader_; + // std::variant could also be used here, but std::tuple leads to better performance + // because of deallocations that std::variant has to do + mutable std::tuple<TProtoRowTypes...> CachedRows_; + mutable int CachedIndex_; + + template <class U> + static constexpr void AssertIsOneOf() + { + static_assert( + (std::is_same<U, TProtoRowTypes>::value || ...), + "Template parameter must be one of TProtoOneOf template parameter"); + } +}; + +template <class T> +class TTableReader<T, std::enable_if_t<TIsBaseOf<Message, T>::Value>> + : public TTableReader<TProtoOneOf<T>> +{ +public: + using TRowType = T; + using TBase = TTableReader<TProtoOneOf<T>>; + + using TBase::TBase; + + const T& GetRow() const + { + return TBase::template GetRow<T>(); + } + + void MoveRow(T* result) + { + TBase::template MoveRow<T>(result); + } + + T MoveRow() + { 
+ return TBase::template MoveRow<T>(); + } +}; + +template<class... TSkiffRowTypes> +class TTableReader<TSkiffRowOneOf<TSkiffRowTypes...>> + : public NDetail::TTableReaderBase<TSkiffRowOneOf<TSkiffRowTypes...>> +{ +public: + using TBase = NDetail::TTableReaderBase<TSkiffRowOneOf<TSkiffRowTypes...>>; + + using TBase::TBase; + + explicit TTableReader(::TIntrusivePtr<typename TBase::IReaderImpl> reader, const TMaybe<TSkiffRowHints>& hints) + : TBase(reader) + , Parsers_({(CreateSkiffParser<TSkiffRowTypes>(&std::get<TSkiffRowTypes>(CachedRows_), hints))...}) + { } + + template <class U> + const U& GetRow() const + { + AssertIsOneOf<U>(); + return TBase::DoGetRowCached( + /* cacher */ [&] { + auto index = NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value; + Reader_->ReadRow(Parsers_[index]); + CachedIndex_ = index; + }, + /* cacheGetter */ [&] { + return &std::get<U>(CachedRows_); + }); + } + + template <class U> + void MoveRow(U* result) + { + AssertIsOneOf<U>(); + return TBase::DoMoveRowCached( + result, + /* mover */ [&] (U* result) { + auto index = NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value; + Reader_->ReadRow(Parsers_[index]); + *result = std::move(std::get<U>(CachedRows_)); + }, + /* cacheMover */ [&] (U* result) { + Y_VERIFY((NDetail::TIndexInTuple<U, decltype(CachedRows_)>::Value) == CachedIndex_); + *result = std::move(std::get<U>(CachedRows_)); + }); + } + + template <class U> + U MoveRow() + { + U result; + MoveRow(&result); + return result; + } + + ::TIntrusivePtr<ISkiffRowReaderImpl> GetReaderImpl() const + { + return Reader_; + } + +private: + using TBase::Reader_; + // std::variant could also be used here, but std::tuple leads to better performance + // because of deallocations that std::variant has to do + mutable std::tuple<TSkiffRowTypes...> CachedRows_; + mutable std::vector<ISkiffRowParserPtr> Parsers_; + mutable int CachedIndex_; + + template <class U> + static constexpr void AssertIsOneOf() + { + static_assert( + (std::is_same<U, 
TSkiffRowTypes>::value || ...), + "Template parameter must be one of TSkiffRowOneOf template parameter"); + } +}; + +template <class T> +class TTableReader<T, std::enable_if_t<TIsSkiffRow<T>::value>> + : public TTableReader<TSkiffRowOneOf<T>> +{ +public: + using TRowType = T; + using TBase = TTableReader<TSkiffRowOneOf<T>>; + + using TBase::TBase; + + const T& GetRow() + { + return TBase::template GetRow<T>(); + } + + void MoveRow(T* result) + { + TBase::template MoveRow<T>(result); + } + + T MoveRow() + { + return TBase::template MoveRow<T>(); + } +}; + +template <> +inline TTableReaderPtr<TNode> IIOClient::CreateTableReader<TNode>( + const TRichYPath& path, const TTableReaderOptions& options) +{ + return new TTableReader<TNode>(CreateNodeReader(path, options)); +} + +template <> +inline TTableReaderPtr<TYaMRRow> IIOClient::CreateTableReader<TYaMRRow>( + const TRichYPath& path, const TTableReaderOptions& options) +{ + return new TTableReader<TYaMRRow>(CreateYaMRReader(path, options)); +} + +template <class T, class = std::enable_if_t<TIsBaseOf<Message, T>::Value>> +struct TReaderCreator +{ + static TTableReaderPtr<T> Create(::TIntrusivePtr<IProtoReaderImpl> reader) + { + return new TTableReader<T>(reader); + } +}; + +template <class T> +inline TTableReaderPtr<T> IIOClient::CreateTableReader( + const TRichYPath& path, const TTableReaderOptions& options) +{ + if constexpr (TIsBaseOf<Message, T>::Value) { + TAutoPtr<T> prototype(new T); + return new TTableReader<T>(CreateProtoReader(path, options, prototype.Get())); + } else if constexpr (TIsSkiffRow<T>::value) { + const auto& hints = options.FormatHints_ ? 
options.FormatHints_->SkiffRowHints_ : Nothing(); + auto schema = GetSkiffSchema<T>(hints); + auto skipper = CreateSkiffSkipper<T>(hints); + return new TTableReader<T>(CreateSkiffRowReader(path, options, skipper, schema), hints); + } else { + static_assert(TDependentFalse<T>, "Unsupported type for table reader"); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +TTableReaderPtr<T> CreateTableReader( + IInputStream* stream, + const TTableReaderOptions& options) +{ + return TReaderCreator<T>::Create(NDetail::CreateProtoReader(stream, options, T::descriptor())); +} + +template <class... Ts> +TTableReaderPtr<typename NDetail::TProtoOneOfUnique<Ts...>::TType> CreateProtoMultiTableReader( + IInputStream* stream, + const TTableReaderOptions& options) +{ + return new TTableReader<typename NDetail::TProtoOneOfUnique<Ts...>::TType>( + NDetail::CreateProtoReader(stream, options, {Ts::descriptor()...})); +} + +template <class T> +TTableReaderPtr<T> CreateProtoMultiTableReader( + IInputStream* stream, + int tableCount, + const TTableReaderOptions& options) +{ + static_assert(TIsBaseOf<::google::protobuf::Message, T>::Value); + TVector<const ::google::protobuf::Descriptor*> descriptors(tableCount, T::descriptor()); + return new TTableReader<T>(NDetail::CreateProtoReader(stream, options, std::move(descriptors))); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +class TTableRangesReader<T> + : public TThrRefBase +{ +public: + using TRowType = T; + +private: + using TReaderImpl = typename TRowTraits<TRowType>::IReaderImpl; + +public: + TTableRangesReader(::TIntrusivePtr<TReaderImpl> readerImpl) + : ReaderImpl_(readerImpl) + , Reader_(MakeIntrusive<TTableReader<TRowType>>(readerImpl)) + , IsValid_(Reader_->IsValid()) + { } + + TTableReader<T>& GetRange() + { + return *Reader_; + } + + bool IsValid() const + { + return IsValid_; + } + + void Next() + { + 
ReaderImpl_->NextKey(); + if ((IsValid_ = Reader_->IsValid())) { + Reader_->Next(); + } + } + +private: + ::TIntrusivePtr<TReaderImpl> ReaderImpl_; + ::TIntrusivePtr<TTableReader<TRowType>> Reader_; + bool IsValid_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +struct IWriterImplBase + : public TThrRefBase +{ + virtual void AddRow(const T& row, size_t tableIndex) = 0; + + virtual void AddRow(const T& row, size_t tableIndex, size_t /*rowWeight*/) + { + AddRow(row, tableIndex); + } + + virtual void AddRow(T&& row, size_t tableIndex) = 0; + + virtual void AddRow(T&& row, size_t tableIndex, size_t /*rowWeight*/) + { + AddRow(std::move(row), tableIndex); + } + + virtual void AddRowBatch(const TVector<T>& rowBatch, size_t tableIndex, size_t rowBatchWeight = 0) + { + for (const auto& row : rowBatch) { + AddRow(row, tableIndex, rowBatchWeight / rowBatch.size()); + } + } + + virtual void AddRowBatch(TVector<T>&& rowBatch, size_t tableIndex, size_t rowBatchWeight = 0) + { + auto rowBatchSize = rowBatch.size(); + for (auto&& row : std::move(rowBatch)) { + AddRow(std::move(row), tableIndex, rowBatchWeight / rowBatchSize); + } + } + + virtual size_t GetTableCount() const = 0; + virtual void FinishTable(size_t tableIndex) = 0; + virtual void Abort() + { } +}; + +struct INodeWriterImpl + : public IWriterImplBase<TNode> +{ +}; + +struct IYaMRWriterImpl + : public IWriterImplBase<TYaMRRow> +{ +}; + +struct IProtoWriterImpl + : public IWriterImplBase<Message> +{ +}; + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +class TTableWriterBase + : public TThrRefBase +{ +public: + using TRowType = T; + using IWriterImpl = typename TRowTraits<T>::IWriterImpl; + + explicit TTableWriterBase(::TIntrusivePtr<IWriterImpl> writer) + : Writer_(writer) + , Locks_(MakeAtomicShared<TVector<TAdaptiveLock>>(writer->GetTableCount())) + { } + + ~TTableWriterBase() override + { + if 
(Locks_.RefCount() == 1) { + NDetail::FinishOrDie(this, "TTableWriterBase"); + } + } + + void Abort() + { + Writer_->Abort(); + } + + void AddRow(const T& row, size_t tableIndex = 0, size_t rowWeight = 0) + { + DoAddRow<T>(row, tableIndex, rowWeight); + } + + void AddRow(T&& row, size_t tableIndex = 0, size_t rowWeight = 0) + { + DoAddRow<T>(std::move(row), tableIndex, rowWeight); + } + + void AddRowBatch(const TVector<T>& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0) + { + DoAddRowBatch<T>(rowBatch, tableIndex, rowBatchWeight); + } + + void AddRowBatch(TVector<T>&& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0) + { + DoAddRowBatch<T>(std::move(rowBatch), tableIndex, rowBatchWeight); + } + + void Finish() + { + for (size_t i = 0; i < Locks_->size(); ++i) { + auto guard = Guard((*Locks_)[i]); + Writer_->FinishTable(i); + } + } + +protected: + template <class U> + void DoAddRow(const U& row, size_t tableIndex = 0, size_t rowWeight = 0) + { + if (tableIndex >= Locks_->size()) { + ythrow TIOException() << + "Table index " << tableIndex << + " is out of range [0, " << Locks_->size() << ")"; + } + + auto guard = Guard((*Locks_)[tableIndex]); + Writer_->AddRow(row, tableIndex, rowWeight); + } + + template <class U> + void DoAddRow(U&& row, size_t tableIndex = 0, size_t rowWeight = 0) + { + if (tableIndex >= Locks_->size()) { + ythrow TIOException() << + "Table index " << tableIndex << + " is out of range [0, " << Locks_->size() << ")"; + } + + auto guard = Guard((*Locks_)[tableIndex]); + Writer_->AddRow(std::move(row), tableIndex, rowWeight); + } + + template <class U> + void DoAddRowBatch(const TVector<U>& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0) + { + if (tableIndex >= Locks_->size()) { + ythrow TIOException() << + "Table index " << tableIndex << + " is out of range [0, " << Locks_->size() << ")"; + } + + auto guard = Guard((*Locks_)[tableIndex]); + Writer_->AddRowBatch(rowBatch, tableIndex, rowBatchWeight); + } + + template 
<class U> + void DoAddRowBatch(TVector<U>&& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0) + { + if (tableIndex >= Locks_->size()) { + ythrow TIOException() << + "Table index " << tableIndex << + " is out of range [0, " << Locks_->size() << ")"; + } + + auto guard = Guard((*Locks_)[tableIndex]); + Writer_->AddRowBatch(std::move(rowBatch), tableIndex, rowBatchWeight); + } + + ::TIntrusivePtr<IWriterImpl> GetWriterImpl() + { + return Writer_; + } + +private: + ::TIntrusivePtr<IWriterImpl> Writer_; + TAtomicSharedPtr<TVector<TAdaptiveLock>> Locks_; +}; + +template <> +class TTableWriter<TNode> + : public TTableWriterBase<TNode> +{ +public: + using TBase = TTableWriterBase<TNode>; + + explicit TTableWriter(::TIntrusivePtr<IWriterImpl> writer) + : TBase(writer) + { } +}; + +template <> +class TTableWriter<TYaMRRow> + : public TTableWriterBase<TYaMRRow> +{ +public: + using TBase = TTableWriterBase<TYaMRRow>; + + explicit TTableWriter(::TIntrusivePtr<IWriterImpl> writer) + : TBase(writer) + { } +}; + +template <> +class TTableWriter<Message> + : public TTableWriterBase<Message> +{ +public: + using TBase = TTableWriterBase<Message>; + + explicit TTableWriter(::TIntrusivePtr<IWriterImpl> writer) + : TBase(writer) + { } + + template <class U, std::enable_if_t<std::is_base_of<Message, U>::value>* = nullptr> + void AddRow(const U& row, size_t tableIndex = 0, size_t rowWeight = 0) + { + TBase::AddRow(row, tableIndex, rowWeight); + } + + template <class U, std::enable_if_t<std::is_base_of<Message, U>::value>* = nullptr> + void AddRowBatch(const TVector<U>& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0) + { + for (const auto& row : rowBatch) { + AddRow(row, tableIndex, rowBatchWeight / rowBatch.size()); + } + } +}; + +template <class T> +class TTableWriter<T, std::enable_if_t<TIsBaseOf<Message, T>::Value>> + : public TTableWriter<Message> +{ +public: + using TRowType = T; + using TBase = TTableWriter<Message>; + + explicit 
TTableWriter(::TIntrusivePtr<IWriterImpl> writer) + : TBase(writer) + { } + + void AddRow(const T& row, size_t tableIndex = 0, size_t rowWeight = 0) + { + TBase::AddRow<T>(row, tableIndex, rowWeight); + } + + void AddRowBatch(const TVector<T>& rowBatch, size_t tableIndex = 0, size_t rowBatchWeight = 0) + { + TBase::AddRowBatch<T>(rowBatch, tableIndex, rowBatchWeight); + } +}; + +template <> +inline TTableWriterPtr<TNode> IIOClient::CreateTableWriter<TNode>( + const TRichYPath& path, const TTableWriterOptions& options) +{ + return new TTableWriter<TNode>(CreateNodeWriter(path, options)); +} + +template <> +inline TTableWriterPtr<TYaMRRow> IIOClient::CreateTableWriter<TYaMRRow>( + const TRichYPath& path, const TTableWriterOptions& options) +{ + return new TTableWriter<TYaMRRow>(CreateYaMRWriter(path, options)); +} + +template <class T> +inline TTableWriterPtr<T> IIOClient::CreateTableWriter( + const TRichYPath& path, const TTableWriterOptions& options) +{ + if constexpr (TIsBaseOf<Message, T>::Value) { + TAutoPtr<T> prototype(new T); + return new TTableWriter<T>(CreateProtoWriter(path, options, prototype.Get())); + } else { + static_assert(TDependentFalse<T>, "Unsupported type for table writer"); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T> +TTableReaderPtr<T> CreateConcreteProtobufReader(TTableReader<Message>* reader) +{ + static_assert(std::is_base_of_v<Message, T>, "T must be a protobuf type (either Message or its descendant)"); + Y_ENSURE(reader, "reader must be non-null"); + return ::MakeIntrusive<TTableReader<T>>(reader->GetReaderImpl()); +} + +template <typename T> +TTableReaderPtr<T> CreateConcreteProtobufReader(const TTableReaderPtr<Message>& reader) +{ + Y_ENSURE(reader, "reader must be non-null"); + return CreateConcreteProtobufReader<T>(reader.Get()); +} + +template <typename T> +TTableReaderPtr<Message> CreateGenericProtobufReader(TTableReader<T>* reader) +{ + 
static_assert(std::is_base_of_v<Message, T>, "T must be a protobuf type (either Message or its descendant)"); + Y_ENSURE(reader, "reader must be non-null"); + return ::MakeIntrusive<TTableReader<Message>>(reader->GetReaderImpl()); +} + +template <typename T> +TTableReaderPtr<Message> CreateGenericProtobufReader(const TTableReaderPtr<T>& reader) +{ + Y_ENSURE(reader, "reader must be non-null"); + return CreateGenericProtobufReader(reader.Get()); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/io.cpp b/yt/cpp/mapreduce/interface/io.cpp new file mode 100644 index 0000000000..f97629721a --- /dev/null +++ b/yt/cpp/mapreduce/interface/io.cpp @@ -0,0 +1,47 @@ +#include "io.h" + +#include <yt/cpp/mapreduce/interface/logging/yt_log.h> + +#include <util/string/cast.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +TMaybe<size_t> IReaderImplBase::GetReadByteCount() const +{ + return Nothing(); +} + +i64 IReaderImplBase::GetTabletIndex() const +{ + Y_FAIL("Unimplemented"); +} + +bool IReaderImplBase::IsEndOfStream() const +{ + Y_FAIL("Unimplemented"); +} + +bool IReaderImplBase::IsRawReaderExhausted() const +{ + Y_FAIL("Unimplemented"); +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +void LogTableReaderStatistics(ui64 rowCount, TMaybe<size_t> byteCount) +{ + TString byteCountStr = (byteCount ? 
::ToString(*byteCount) : "<unknown>"); + YT_LOG_DEBUG("Table reader has read %v rows, %v bytes", + rowCount, + byteCountStr); +} + +} // namespace NDetail + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/io.h b/yt/cpp/mapreduce/interface/io.h new file mode 100644 index 0000000000..e2b20a1802 --- /dev/null +++ b/yt/cpp/mapreduce/interface/io.h @@ -0,0 +1,586 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/io.h +/// +/// Header containing client interface for reading and writing tables and files. + + +#include "fwd.h" + +#include "client_method_options.h" +#include "common.h" +#include "format.h" +#include "node.h" +#include "mpl.h" +#include "skiff_row.h" + +#include <google/protobuf/message.h> + +#include <util/stream/input.h> +#include <util/stream/output.h> +#include <util/generic/yexception.h> +#include <util/generic/maybe.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief "Marker" type to use for several protobuf types in @ref NYT::TTableReader. +/// +/// @tparam Ts Possible types of rows to be read. +template<class... TProtoRowTypes> +class TProtoOneOf +{ +public: + static_assert( + (TIsBaseOf<::google::protobuf::Message, TProtoRowTypes>::Value && ...), + "Template parameters can only be protobuf types"); + + TProtoOneOf() = delete; +}; + +/// +/// @brief "Marker" type to use for several skiff row types in @ref NYT::TTableReader. +/// +/// @tparam Ts Possible types of rows to be read. +template<class... 
TSkiffRowTypes> +class TSkiffRowOneOf +{ +public: + static_assert( + (TIsSkiffRow<TSkiffRowTypes>::value && ...), + "Template parameters can only be SkiffRow types"); + + TSkiffRowOneOf() = delete; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @cond Doxygen_Suppress +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +template <class TTuple> +struct TProtoOneOfFromTuple; + +template <class... Ts> +struct TProtoOneOfFromTuple<std::tuple<Ts...>> +{ + using TType = TProtoOneOf<Ts...>; +}; + +template <class... Ts> +struct TProtoOneOfUnique +{ + using TTuple = typename TUniqueTypes<std::tuple<>, std::tuple<Ts...>>::TType; + using TType = typename TProtoOneOfFromTuple<TTuple>::TType; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail +/// @endcond + +//////////////////////////////////////////////////////////////////////////////// + +struct INodeReaderImpl; +struct IYaMRReaderImpl; +struct IProtoReaderImpl; +struct ISkiffRowReaderImpl; +struct INodeWriterImpl; +struct IYaMRWriterImpl; +struct IProtoWriterImpl; + +//////////////////////////////////////////////////////////////////////////////// + +/// Class of exceptions connected to reading or writing tables or files. +class TIOException + : public yexception +{ }; + +/////////////////////////////////////////////////////////////////////////////// + +/// Interface representing YT file reader. +class IFileReader + : public TThrRefBase + , public IInputStream +{ }; + +/// Interface representing YT file writer. +class IFileWriter + : public TThrRefBase + , public IOutputStream +{ }; + +//////////////////////////////////////////////////////////////////////////////// + +/// Low-level interface to read YT table with retries. 
+class TRawTableReader + : public TThrRefBase + , public IInputStream +{ +public: + /// @brief Retry table read starting from the specified `rangeIndex` and `rowIndex`. + /// + /// @param rangeIndex Index of first range to read + /// @param rowIndex Index of first row to read; if `rowIndex == Nothing` entire request will be retried. + /// + /// @return `true` on successful request retry, `false` if no retry attempts are left (then `Retry()` shouldn't be called any more). + /// + /// `rowIndex` must be inside the range with index `rangeIndex` if the latter is specified. + /// + /// After successful retry the user should reset `rangeIndex` / `rowIndex` values and read new ones + /// from the stream. + virtual bool Retry( + const TMaybe<ui32>& rangeIndex, + const TMaybe<ui64>& rowIndex) = 0; + + /// Resets retry attempt count to the initial value (then `Retry()` can be called again). + virtual void ResetRetries() = 0; + + /// @brief May the input stream contain table ranges? + /// + /// When it is `true`, the `TRawTableReader` user is responsible + /// for tracking the active range index in order to pass it to `Retry()`. + virtual bool HasRangeIndices() const = 0; +}; + +/// @brief Low-level interface to write YT table. +/// +/// Retries must be handled by implementation. +class TRawTableWriter + : public TThrRefBase + , public IOutputStream +{ +public: + /// @brief Call this method after complete row representation is written to the stream. + /// + /// When this method is called `TRawTableWriter` can check its buffer + /// and if it is full send data to YT. + /// @note `TRawTableWriter` never sends partial records to YT (due to retries). + virtual void NotifyRowEnd() = 0; + + /// @brief Try to abort writing process as soon as possible (makes sense for multi-threaded writers). + /// + /// By default it does nothing, but implementations are welcome to override this method. 
+ virtual void Abort() + { } +}; + +/// @brief Interface to deal with multiple raw output streams. +class IProxyOutput +{ +public: + virtual ~IProxyOutput() + { } + + /// Get the number of managed streams. + virtual size_t GetStreamCount() const = 0; + + /// Get stream corresponding to the specified table index. + virtual IOutputStream* GetStream(size_t tableIndex) const = 0; + + /// This handler must be called right after the next row has been written. + virtual void OnRowFinished(size_t tableIndex) = 0; + + /// @brief Try to abort writing process as soon as possible (makes sense for multi-threaded writers). + /// + /// By default it does nothing, but implementations are welcome to override this method. + virtual void Abort() + { } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Class template to read typed rows from YT tables. +/// +/// @tparam T Row type. +/// +/// Correct usage of this class usually looks like +/// ``` +/// for (const auto& cursor : *reader) { +/// const auto& row = cursor.GetRow(); +/// ... +/// } +/// ``` +/// or, more verbosely, +/// ``` +/// for (; reader->IsValid(); reader->Next()) { +/// const auto& row = reader->GetRow(); +/// ... +/// } +/// ``` +/// +/// @note Actual (partial) specializations of this template may look a bit different, +/// e.g. @ref NYT::TTableReader::GetRow, @ref NYT::TTableReader::MoveRow may be method templates. +template <class T, class> +class TTableReader + : public TThrRefBase +{ +public: + /// Get current row. + const T& GetRow() const; + + /// Extract current row; further calls to `GetRow` and `MoveRow` will fail. + T MoveRow(); + + /// Extract current row to `result`; further calls to `GetRow` and `MoveRow` will fail. + void MoveRow(T* result); + + /// Check whether the reader points to a row, i.e. iteration may continue (see the loops in the usage examples). + bool IsValid() const; + + /// Move the cursor to the next row. + void Next(); + + /// Get table index of the current row. 
+ ui32 GetTableIndex() const; + + /// Get range index of the current row (zero if it is unknown or read request contains no ranges) + ui32 GetRangeIndex() const; + + /// Get current row index (zero if it is unknown). + ui64 GetRowIndex() const; + + /// Get current tablet index (for ordered dynamic tables). + i64 GetTabletIndex() const; + + /// Returns `true` if job consumed all the input and `false` otherwise. + bool IsEndOfStream() const; + + /// Returns `true` if job raw input stream was closed and `false` otherwise. + bool IsRawReaderExhausted() const; +}; + +/// @brief Iterator for use in range-based-for. +/// +/// @note Idiomatic usage: +/// ``` +/// for (const auto& cursor : *reader) { +/// const auto& row = cursor.GetRow(); +/// ... +/// } +/// ``` +template <class T> +class TTableReaderIterator +{ +public: + /// Construct iterator from table reader; a `nullptr` reader or one whose `IsValid()` is false stores `nullptr` (the state `end()` produces). + explicit TTableReaderIterator(TTableReader<T>* reader) + { + if (reader && reader->IsValid()) { + Reader_ = reader; + } else { + Reader_ = nullptr; + } + } + + /// Equality operator. + bool operator==(const TTableReaderIterator& it) const + { + return Reader_ == it.Reader_; + } + + /// Inequality operator. + bool operator!=(const TTableReaderIterator& it) const + { + return Reader_ != it.Reader_; + } + + /// Dereference operator. + TTableReader<T>& operator*() + { + return *Reader_; + } + + /// Const dereference operator. + const TTableReader<T>& operator*() const + { + return *Reader_; + } + + /// Preincrement operator. + TTableReaderIterator& operator++() + { + Reader_->Next(); + if (!Reader_->IsValid()) { + Reader_ = nullptr; + } + return *this; + } + +private: + TTableReader<T>* Reader_; +}; + +/// @brief Function to facilitate range-based-for for @ref NYT::TTableReader. 
+/// +/// @see @ref NYT::TTableReaderIterator +template <class T> +TTableReaderIterator<T> begin(TTableReader<T>& reader) +{ + return TTableReaderIterator<T>(&reader); +} + +/// @brief Function to facilitate range-based-for for @ref NYT::TTableReader. +/// +/// @see @ref NYT::TTableReaderIterator +template <class T> +TTableReaderIterator<T> end(TTableReader<T>&) +{ + return TTableReaderIterator<T>(nullptr); +} + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Class to facilitate reading table rows sorted by key. +/// +/// Each reader returned from @ref NYT::TTableRangesReader::GetRange represents +/// a range of rows with the same key. +/// +/// @note Idiomatic usage: +/// ``` +/// for (; reader->IsValid(); reader->Next()) { +/// auto& rangeReader = reader->GetRange(); +/// ... +/// } +/// ``` +template <class T, class> +class TTableRangesReader + : public TThrRefBase +{ +public: + /// Get reader for rows with the same key. + TTableReader<T>& GetRange(); + + /// Check whether there is a current range, i.e. iteration may continue (see the loop in the usage example). + bool IsValid() const; + + /// Move cursor to the next range. + void Next(); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Class template to write typed rows to YT tables. +template <class T, class> +class TTableWriter + : public TThrRefBase +{ +public: + /// @brief Submit a row for writing. + /// + /// The row may (and very probably will) *not* be written immediately. + void AddRow(const T& row); + + /// Stop writing data as soon as possible (without flushing data, e.g. before aborting parent transaction). NOTE(review): this wording reads like a description of `Abort()`; `TTableWriterBase::Finish()` calls `FinishTable()` for every table - confirm the intended contract. + void Finish(); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Type representing YaMR table row. 
+/// +/// @deprecated +struct TYaMRRow +{ + /// Key column. + TStringBuf Key; + + /// Subkey column. + TStringBuf SubKey; + + /// Value column. 
+ TStringBuf Value; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Interface for creating table and file readers and writers. +class IIOClient +{ +public: + virtual ~IIOClient() = default; + + /// Create a reader for file at `path`. + virtual IFileReaderPtr CreateFileReader( + const TRichYPath& path, + const TFileReaderOptions& options = TFileReaderOptions()) = 0; + + /// Create a writer for file at `path`. + virtual IFileWriterPtr CreateFileWriter( + const TRichYPath& path, + const TFileWriterOptions& options = TFileWriterOptions()) = 0; + + /// Create a typed reader for table at `path`. + template <class T> + TTableReaderPtr<T> CreateTableReader( + const TRichYPath& path, + const TTableReaderOptions& options = TTableReaderOptions()); + + /// Create a typed writer for table at `path`. + template <class T> + TTableWriterPtr<T> CreateTableWriter( + const TRichYPath& path, + const TTableWriterOptions& options = TTableWriterOptions()); + + /// Create a writer to write protobuf messages with specified descriptor. + virtual TTableWriterPtr<::google::protobuf::Message> CreateTableWriter( + const TRichYPath& path, + const ::google::protobuf::Descriptor& descriptor, + const TTableWriterOptions& options = TTableWriterOptions()) = 0; + + /// Create a reader to read a table using specified format. + virtual TRawTableReaderPtr CreateRawReader( + const TRichYPath& path, + const TFormat& format, + const TTableReaderOptions& options = TTableReaderOptions()) = 0; + + /// Create a writer to write a table using specified format. + virtual TRawTableWriterPtr CreateRawWriter( + const TRichYPath& path, + const TFormat& format, + const TTableWriterOptions& options = TTableWriterOptions()) = 0; + + /// + /// @brief Create a reader for [blob table](https://docs.yandex-team.ru/docs/yt/description/storage/blobtables) at `path`. + /// + /// @param path Blob table path. + /// @param blobId Key identifying the blob. 
+ /// @param options Optional parameters + /// + /// A blob table is a table that stores a number of blobs. + /// Blobs are sliced into parts of the same size (except possibly the last part). + /// Those parts are stored in separate rows. + /// + /// A blob table has constraints on its schema. + /// - There must be columns that identify the blob (blob id columns). Those columns may be of any type. + /// - There must be a column of `int64` type that identifies the part inside the blob (this column is called `part index`). + /// - There must be a column of `string` type that stores actual data (this column is called `data column`). + virtual IFileReaderPtr CreateBlobTableReader( + const TYPath& path, + const TKey& blobId, + const TBlobTableReaderOptions& options = TBlobTableReaderOptions()) = 0; + +private: + virtual ::TIntrusivePtr<INodeReaderImpl> CreateNodeReader( + const TRichYPath& path, const TTableReaderOptions& options) = 0; + + virtual ::TIntrusivePtr<IYaMRReaderImpl> CreateYaMRReader( + const TRichYPath& path, const TTableReaderOptions& options) = 0; + + virtual ::TIntrusivePtr<IProtoReaderImpl> CreateProtoReader( + const TRichYPath& path, + const TTableReaderOptions& options, + const ::google::protobuf::Message* prototype) = 0; + + virtual ::TIntrusivePtr<ISkiffRowReaderImpl> CreateSkiffRowReader( + const TRichYPath& path, + const TTableReaderOptions& options, + const ISkiffRowSkipperPtr& skipper, + const NSkiff::TSkiffSchemaPtr& schema) = 0; + + virtual ::TIntrusivePtr<INodeWriterImpl> CreateNodeWriter( + const TRichYPath& path, const TTableWriterOptions& options) = 0; + + virtual ::TIntrusivePtr<IYaMRWriterImpl> CreateYaMRWriter( + const TRichYPath& path, const TTableWriterOptions& options) = 0; + + virtual ::TIntrusivePtr<IProtoWriterImpl> CreateProtoWriter( + const TRichYPath& path, + const TTableWriterOptions& options, + const ::google::protobuf::Message* prototype) = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + 
+/// +/// @brief Create a protobuf table reader from a stream. +/// +/// @tparam T Protobuf message type to read (must be inherited from `Message`). +/// +/// @param stream Input stream in YT protobuf format. +template <typename T> +TTableReaderPtr<T> CreateTableReader( + IInputStream* stream, + const TTableReaderOptions& options = {}); + +/// +/// @brief Create a protobuf multi table reader from a stream. +/// +/// @tparam Ts Protobuf message types to read (must be inherited from `Message`). +/// +/// @param stream Input stream in YT protobuf format. +template <class... Ts> +TTableReaderPtr<typename NDetail::TProtoOneOfUnique<Ts...>::TType> CreateProtoMultiTableReader( + IInputStream* stream, + const TTableReaderOptions& options = {}); + +/// +/// @brief Create a homogenous protobuf multi table reader from a stream. +/// +/// @tparam T Protobuf message type to read (must be inherited from `Message`). +/// +/// @param stream Input stream in YT protobuf format. +/// @param tableCount Number of tables in input stream. +template <class T> +TTableReaderPtr<T> CreateProtoMultiTableReader( + IInputStream* stream, + int tableCount, + const TTableReaderOptions& options = {}); + +/// Create a @ref NYT::TNode table reader from a stream. +template <> +TTableReaderPtr<TNode> CreateTableReader<TNode>( + IInputStream* stream, const TTableReaderOptions& options); + +/// Create a @ref NYT::TYaMRRow table reader from a stream. +template <> +TTableReaderPtr<TYaMRRow> CreateTableReader<TYaMRRow>( + IInputStream* stream, const TTableReaderOptions& options); + +namespace NDetail { + +/// Create a protobuf table reader from a stream. +::TIntrusivePtr<IProtoReaderImpl> CreateProtoReader( + IInputStream* stream, + const TTableReaderOptions& options, + const ::google::protobuf::Descriptor* descriptor); + + +/// Create a protobuf table reader from a stream that can contain table switches. 
+::TIntrusivePtr<IProtoReaderImpl> CreateProtoReader( + IInputStream* stream, + const TTableReaderOptions& options, + TVector<const ::google::protobuf::Descriptor*> descriptors); + +} // namespace NDetail + +//////////////////////////////////////////////////////////////////////////////// + +/// Convert generic protobuf table reader to a concrete one (for certain type `T`). +template <typename T> +TTableReaderPtr<T> CreateConcreteProtobufReader(TTableReader<Message>* reader); + +/// Convert generic protobuf table reader to a concrete one (for certain type `T`). +template <typename T> +TTableReaderPtr<T> CreateConcreteProtobufReader(const TTableReaderPtr<Message>& reader); + +/// Convert a concrete (for certain type `T`) protobuf table reader to a generic one. +template <typename T> +TTableReaderPtr<Message> CreateGenericProtobufReader(TTableReader<T>* reader); + +/// Convert a concrete (for certain type `T`) protobuf table reader to a generic one. +template <typename T> +TTableReaderPtr<Message> CreateGenericProtobufReader(const TTableReaderPtr<T>& reader); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT + +#define IO_INL_H_ +#include "io-inl.h" +#undef IO_INL_H_ diff --git a/yt/cpp/mapreduce/interface/job_counters.cpp b/yt/cpp/mapreduce/interface/job_counters.cpp new file mode 100644 index 0000000000..6d4a2a6fcb --- /dev/null +++ b/yt/cpp/mapreduce/interface/job_counters.cpp @@ -0,0 +1,164 @@ +#include "job_counters.h" + +namespace NYT { + +//////////////////////////////////////////////////////////////////// + +namespace { + ui64 CountTotal(const TNode& data) + { + if (data.IsMap()) { + if (auto totalPtr = data.AsMap().FindPtr("total")) { + return data["total"].IntCast<ui64>(); + } else { + ui64 total = 0; + for (const auto& keyVal: data.AsMap()) { + total += CountTotal(keyVal.second); + } + return total; + } + } else { + return data.IntCast<ui64>(); + } + } + + TNode GetNode(const TNode& data, const 
TStringBuf& key) + { + if (auto resPtr = data.AsMap().FindPtr(key)) { + return *resPtr; + } + return TNode(); + } +} // namespace + +//////////////////////////////////////////////////////////////////// + +TJobCounter::TJobCounter(TNode data) + : Data_(std::move(data)) +{ + if (Data_.HasValue()) { + Total_ = CountTotal(Data_); + } +} + +TJobCounter::TJobCounter(ui64 total) + : Total_(total) +{ } + +ui64 TJobCounter::GetTotal() const +{ + return Total_; +} + +ui64 TJobCounter::GetValue(const TStringBuf key) const +{ + if (Data_.HasValue()) { + return CountTotal(Data_[key]); + } + return 0; +} + +//////////////////////////////////////////////////////////////////// + +TJobCounters::TJobCounters(const NYT::TNode& counters) + : Total_(0) +{ + if (!counters.IsMap()) { + ythrow yexception() << "TJobCounters must be initialized with Map type TNode"; + } + auto abortedNode = GetNode(counters, "aborted"); + if (abortedNode.HasValue()) { + Aborted_ = TJobCounter(GetNode(abortedNode, "total")); + AbortedScheduled_ = TJobCounter(GetNode(abortedNode, "scheduled")); + AbortedNonScheduled_ = TJobCounter(GetNode(abortedNode, "non_scheduled")); + } + auto completedNode = GetNode(counters, "completed"); + if (completedNode.HasValue()) { + Completed_ = TJobCounter(GetNode(completedNode, "total")); + CompletedNonInterrupted_ = TJobCounter(GetNode(completedNode, "non-interrupted")); + CompletedInterrupted_ = TJobCounter(GetNode(completedNode, "interrupted")); + } + Lost_ = TJobCounter(GetNode(counters, "lost")); + Invalidated_ = TJobCounter(GetNode(counters, "invalidated")); + Failed_ = TJobCounter(GetNode(counters, "failed")); + Running_ = TJobCounter(GetNode(counters, "running")); + Suspended_ = TJobCounter(GetNode(counters, "suspended")); + Pending_ = TJobCounter(GetNode(counters, "pending")); + Blocked_ = TJobCounter(GetNode(counters, "blocked")); + Total_ = CountTotal(counters); +} + + +const TJobCounter& TJobCounters::GetAborted() const +{ + return Aborted_; +} + +const 
TJobCounter& TJobCounters::GetAbortedScheduled() const +{ + return AbortedScheduled_; +} + +const TJobCounter& TJobCounters::GetAbortedNonScheduled() const +{ + return AbortedNonScheduled_; +} + +const TJobCounter& TJobCounters::GetCompleted() const +{ + return Completed_; +} + +const TJobCounter& TJobCounters::GetCompletedNonInterrupted() const +{ + return CompletedNonInterrupted_; +} + +const TJobCounter& TJobCounters::GetCompletedInterrupted() const +{ + return CompletedInterrupted_; +} + +const TJobCounter& TJobCounters::GetLost() const +{ + return Lost_; +} + +const TJobCounter& TJobCounters::GetInvalidated() const +{ + return Invalidated_; +} + +const TJobCounter& TJobCounters::GetFailed() const +{ + return Failed_; +} + +const TJobCounter& TJobCounters::GetRunning() const +{ + return Running_; +} + +const TJobCounter& TJobCounters::GetSuspended() const +{ + return Suspended_; +} + +const TJobCounter& TJobCounters::GetPending() const +{ + return Pending_; +} + +const TJobCounter& TJobCounters::GetBlocked() const +{ + return Blocked_; +} + +ui64 TJobCounters::GetTotal() const +{ + return Total_; +} + +//////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/job_counters.h b/yt/cpp/mapreduce/interface/job_counters.h new file mode 100644 index 0000000000..9257cc1ec1 --- /dev/null +++ b/yt/cpp/mapreduce/interface/job_counters.h @@ -0,0 +1,74 @@ +#pragma once + +#include "fwd.h" + +#include <yt/cpp/mapreduce/interface/node.h> + +namespace NYT { + +class TJobCounter +{ +private: + TNode Data_; + ui64 Total_ = 0; + +public: + TJobCounter() = default; + + TJobCounter(TNode data); + TJobCounter(ui64 total); + + ui64 GetTotal() const; + + ui64 GetValue(const TStringBuf key) const; +}; + +/// Class representing a collection of job counters. +class TJobCounters +{ +public: + /// + /// Construct empty counter. + TJobCounters() = default; + + /// + /// Construct counter from counters node. 
+ TJobCounters(const NYT::TNode& counters); + + const TJobCounter& GetAborted() const; + const TJobCounter& GetAbortedScheduled() const; + const TJobCounter& GetAbortedNonScheduled() const; + const TJobCounter& GetCompleted() const; + const TJobCounter& GetCompletedNonInterrupted() const; + const TJobCounter& GetCompletedInterrupted() const; + const TJobCounter& GetLost() const; + const TJobCounter& GetInvalidated() const; + const TJobCounter& GetFailed() const; + const TJobCounter& GetRunning() const; + const TJobCounter& GetSuspended() const; + const TJobCounter& GetPending() const; + const TJobCounter& GetBlocked() const; + + ui64 GetTotal() const; + +private: + ui64 Total_ = 0; + + TJobCounter Aborted_; + TJobCounter AbortedScheduled_; + TJobCounter AbortedNonScheduled_; + TJobCounter Completed_; + TJobCounter CompletedNonInterrupted_; + TJobCounter CompletedInterrupted_; + TJobCounter Lost_; + TJobCounter Invalidated_; + TJobCounter Failed_; + TJobCounter Running_; + TJobCounter Suspended_; + TJobCounter Pending_; + TJobCounter Blocked_; +}; + +//////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/job_counters_ut.cpp b/yt/cpp/mapreduce/interface/job_counters_ut.cpp new file mode 100644 index 0000000000..56d3932b8f --- /dev/null +++ b/yt/cpp/mapreduce/interface/job_counters_ut.cpp @@ -0,0 +1,103 @@ +#include <yt/cpp/mapreduce/interface/job_counters.h> +#include <yt/cpp/mapreduce/interface/operation.h> + +#include <library/cpp/yson/node/node_io.h> + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NYT; + +Y_UNIT_TEST_SUITE(JobCounters) +{ + Y_UNIT_TEST(Full) + { + const TString input = R"""( + { + "completed" = { + "total" = 6; + "non-interrupted" = 1; + "interrupted" = { + "whatever_interrupted" = 2; + "whatever_else_interrupted" = 3; + }; + }; + "aborted" = { + "non_scheduled" = { + "whatever_non_scheduled" = 4; + "whatever_else_non_scheduled" = 5; + }; + 
"scheduled" = { + "whatever_scheduled" = 6; + "whatever_else_scheduled" = 7; + }; + "total" = 22; + }; + "lost" = 8; + "invalidated" = 9; + "failed" = 10; + "running" = 11; + "suspended" = 12; + "pending" = 13; + "blocked" = 14; + "total" = 105; + })"""; + + TJobCounters counters(NodeFromYsonString(input)); + + UNIT_ASSERT_VALUES_EQUAL(counters.GetTotal(), 105); + + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompleted().GetTotal(), 6); + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedNonInterrupted().GetTotal(), 1); + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedInterrupted().GetTotal(), 5); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAborted().GetTotal(), 22); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedNonScheduled().GetTotal(), 9); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedScheduled().GetTotal(), 13); + UNIT_ASSERT_VALUES_EQUAL(counters.GetLost().GetTotal(), 8); + UNIT_ASSERT_VALUES_EQUAL(counters.GetInvalidated().GetTotal(), 9); + UNIT_ASSERT_VALUES_EQUAL(counters.GetFailed().GetTotal(), 10); + UNIT_ASSERT_VALUES_EQUAL(counters.GetRunning().GetTotal(), 11); + UNIT_ASSERT_VALUES_EQUAL(counters.GetSuspended().GetTotal(), 12); + UNIT_ASSERT_VALUES_EQUAL(counters.GetPending().GetTotal(), 13); + UNIT_ASSERT_VALUES_EQUAL(counters.GetBlocked().GetTotal(), 14); + + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedInterrupted().GetValue("whatever_interrupted"), 2); + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedInterrupted().GetValue("whatever_else_interrupted"), 3); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedNonScheduled().GetValue("whatever_non_scheduled"), 4); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedNonScheduled().GetValue("whatever_else_non_scheduled"), 5); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedScheduled().GetValue("whatever_scheduled"), 6); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedScheduled().GetValue("whatever_else_scheduled"), 7); + + UNIT_ASSERT_EXCEPTION(counters.GetCompletedInterrupted().GetValue("Nothingness"), yexception); + } + + 
Y_UNIT_TEST(Empty) + { + const TString input = "{}"; + + TJobCounters counters(NodeFromYsonString(input)); + + UNIT_ASSERT_VALUES_EQUAL(counters.GetTotal(), 0); + + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompleted().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedNonInterrupted().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetCompletedInterrupted().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAborted().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedNonScheduled().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetAbortedScheduled().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetLost().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetInvalidated().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetFailed().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetRunning().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetSuspended().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetPending().GetTotal(), 0); + UNIT_ASSERT_VALUES_EQUAL(counters.GetBlocked().GetTotal(), 0); + } + + Y_UNIT_TEST(Broken) + { + UNIT_ASSERT_EXCEPTION_CONTAINS(TJobCounters(TNode()), yexception, "TJobCounters"); + UNIT_ASSERT_EXCEPTION_CONTAINS(TJobCounters(TNode(1)), yexception, "TJobCounters"); + UNIT_ASSERT_EXCEPTION_CONTAINS(TJobCounters(TNode(1.0)), yexception, "TJobCounters"); + UNIT_ASSERT_EXCEPTION_CONTAINS(TJobCounters(TNode("Whatever")), yexception, "TJobCounters"); + } +} diff --git a/yt/cpp/mapreduce/interface/job_statistics.cpp b/yt/cpp/mapreduce/interface/job_statistics.cpp new file mode 100644 index 0000000000..bd9791672d --- /dev/null +++ b/yt/cpp/mapreduce/interface/job_statistics.cpp @@ -0,0 +1,361 @@ +#include "job_statistics.h" + +#include "operation.h" + +#include <library/cpp/yson/node/node.h> +#include <library/cpp/yson/node/serialize.h> + +#include <library/cpp/yson/writer.h> + +#include <util/datetime/base.h> +#include <util/generic/hash_set.h> +#include <util/generic/ptr.h> +#include 
<util/stream/file.h> +#include <util/string/cast.h> +#include <util/string/subst.h> +#include <util/system/file.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////// + +template <> +i64 ConvertJobStatisticsEntry(i64 value) +{ + return value; +} + +template <> +TDuration ConvertJobStatisticsEntry(i64 value) +{ + return TDuration::MilliSeconds(value); +} + +//////////////////////////////////////////////////////////////////// + +static TTaskName JobTypeToTaskName(EJobType jobType) +{ + switch (jobType) { + case EJobType::PartitionMap: + return ETaskName::PartitionMap0; + case EJobType::Partition: + return ETaskName::Partition0; + default: + return ToString(jobType); + } +} + +static TTaskName FixTaskName(TString taskName) +{ + if (taskName == "partition") { + return ETaskName::Partition0; + } else if (taskName == "partition_map") { + return ETaskName::PartitionMap0; + } + return taskName; +} + +//////////////////////////////////////////////////////////////////// + +class TJobStatistics::TData + : public TThrRefBase +{ +public: + using TTaskName2Data = THashMap<TString, TJobStatistics::TDataEntry>; + using TState2TaskName2Data = THashMap<EJobState, TTaskName2Data>; + using TName2State2TaskName2Data = THashMap<TString, TState2TaskName2Data>; + +public: + TName2State2TaskName2Data Name2State2TaskName2Data; + +public: + TData() = default; + + TData(const TNode& statisticsNode) + { + ParseNode(statisticsNode, TString(), &Name2State2TaskName2Data); + } + + static void Aggregate(TJobStatistics::TDataEntry* result, const TJobStatistics::TDataEntry& other) + { + result->Max = Max(result->Max, other.Max); + result->Min = Min(result->Min, other.Min); + result->Sum += other.Sum; + result->Count += other.Count; + } + + static void ParseNode(const TNode& node, TState2TaskName2Data* output) + { + auto getInt = [] (const TNode& theNode, TStringBuf key) { + const auto& nodeAsMap = theNode.AsMap(); + auto it = nodeAsMap.find(key); + if (it == 
nodeAsMap.end()) { + ythrow yexception() << "Key '" << key << "' is not found"; + } + const auto& valueNode = it->second; + if (!valueNode.IsInt64()) { + ythrow yexception() << "Key '" << key << "' is not of int64 type"; + } + return valueNode.AsInt64(); + }; + + for (const auto& [stateStr, taskName2DataNode] : node.AsMap()) { + EJobState state; + if (!TryFromString(stateStr, state)) { + continue; + } + for (const auto& [taskName, dataNode] : taskName2DataNode.AsMap()) { + auto fixedTaskName = FixTaskName(taskName); + auto& data = (*output)[state][fixedTaskName.Get()]; + data.Max = getInt(dataNode, "max"); + data.Min = getInt(dataNode, "min"); + data.Sum = getInt(dataNode, "sum"); + data.Count = getInt(dataNode, "count"); + } + } + } + + static void ParseNode(const TNode& node, const TString& curPath, TName2State2TaskName2Data* output) + { + Y_VERIFY(node.IsMap()); + + for (const auto& [key, value] : node.AsMap()) { + if (key == "$"sv) { + ParseNode(value, &(*output)[curPath]); + } else { + TString childPath = curPath; + if (!childPath.empty()) { + childPath.push_back('/'); + } + if (key.find_first_of('/') != key.npos) { + TString keyCopy(key); + SubstGlobal(keyCopy, "/", "\\/"); + childPath += keyCopy; + } else { + childPath += key; + } + ParseNode(value, childPath, output); + } + } + } +}; + +//////////////////////////////////////////////////////////////////// + +struct TJobStatistics::TFilter + : public TThrRefBase +{ + TVector<TTaskName> TaskNameFilter; + TVector<EJobState> JobStateFilter = {EJobState::Completed}; +}; + +//////////////////////////////////////////////////////////////////// + +const TString TJobStatistics::CustomStatisticsNamePrefix_ = "custom/"; + +TJobStatistics::TJobStatistics() + : Data_(::MakeIntrusive<TData>()) + , Filter_(::MakeIntrusive<TFilter>()) +{ } + + +TJobStatistics::TJobStatistics(const NYT::TNode& statisticsNode) + : Data_(::MakeIntrusive<TData>(statisticsNode)) + , Filter_(::MakeIntrusive<TFilter>()) +{ } + 
+TJobStatistics::TJobStatistics(::TIntrusivePtr<TData> data, ::TIntrusivePtr<TFilter> filter) + : Data_(data) + , Filter_(::MakeIntrusive<TFilter>(*filter)) +{ } + +TJobStatistics::TJobStatistics(const TJobStatistics& jobStatistics) = default; +TJobStatistics::TJobStatistics(TJobStatistics&&) = default; + +TJobStatistics& TJobStatistics::operator=(const TJobStatistics& jobStatistics) = default; +TJobStatistics& TJobStatistics::operator=(TJobStatistics&& jobStatistics) = default; + +TJobStatistics::~TJobStatistics() = default; + +TJobStatistics TJobStatistics::TaskName(TVector<TTaskName> taskNames) const +{ + auto newFilter = ::MakeIntrusive<TFilter>(*Filter_); + newFilter->TaskNameFilter = std::move(taskNames); + return TJobStatistics(Data_, std::move(newFilter)); +} + +TJobStatistics TJobStatistics::JobState(TVector<EJobState> jobStates) const +{ + auto newFilter = ::MakeIntrusive<TFilter>(*Filter_); + newFilter->JobStateFilter = std::move(jobStates); + return TJobStatistics(Data_, std::move(newFilter)); +} + +TJobStatistics TJobStatistics::JobType(TVector<EJobType> jobTypes) const +{ + TVector<TTaskName> taskNames; + for (auto jobType : jobTypes) { + taskNames.push_back(JobTypeToTaskName(jobType)); + } + return TaskName(std::move(taskNames)); +} + +bool TJobStatistics::HasStatistics(TStringBuf name) const +{ + return Data_->Name2State2TaskName2Data.contains(name); +} + +TJobStatisticsEntry<i64> TJobStatistics::GetStatistics(TStringBuf name) const +{ + return GetStatisticsAs<i64>(name); +} + +TVector<TString> TJobStatistics::GetStatisticsNames() const +{ + TVector<TString> result; + result.reserve(Data_->Name2State2TaskName2Data.size()); + for (const auto& entry : Data_->Name2State2TaskName2Data) { + result.push_back(entry.first); + } + return result; +} + +bool TJobStatistics::HasCustomStatistics(TStringBuf name) const +{ + return HasStatistics(CustomStatisticsNamePrefix_ + name); +} + +TJobStatisticsEntry<i64> TJobStatistics::GetCustomStatistics(TStringBuf name) 
const +{ + return GetCustomStatisticsAs<i64>(name); +} + +TVector<TString> TJobStatistics::GetCustomStatisticsNames() const +{ + TVector<TString> result; + for (const auto& entry : Data_->Name2State2TaskName2Data) { + if (entry.first.StartsWith(CustomStatisticsNamePrefix_)) { + result.push_back(entry.first.substr(CustomStatisticsNamePrefix_.size())); + } + } + return result; +} + +TMaybe<TJobStatistics::TDataEntry> TJobStatistics::GetStatisticsImpl(TStringBuf name) const +{ + auto name2State2TaskName2DataIt = Data_->Name2State2TaskName2Data.find(name); + Y_ENSURE( + name2State2TaskName2DataIt != Data_->Name2State2TaskName2Data.end(), + "Statistics '" << name << "' are missing"); + const auto& state2TaskName2Data = name2State2TaskName2DataIt->second; + + TMaybe<TDataEntry> result; + auto aggregate = [&] (const TDataEntry& data) { + if (result) { + TData::Aggregate(&result.GetRef(), data); + } else { + result = data; + } + }; + + auto aggregateTaskName2Data = [&] (const TData::TTaskName2Data& taskName2Data) { + if (Filter_->TaskNameFilter.empty()) { + for (const auto& [taskName, data] : taskName2Data) { + aggregate(data); + } + } else { + for (const auto& taskName : Filter_->TaskNameFilter) { + auto it = taskName2Data.find(taskName.Get()); + if (it == taskName2Data.end()) { + continue; + } + const auto& data = it->second; + aggregate(data); + } + } + }; + + if (Filter_->JobStateFilter.empty()) { + for (const auto& [state, taskName2Data] : state2TaskName2Data) { + aggregateTaskName2Data(taskName2Data); + } + } else { + for (auto state : Filter_->JobStateFilter) { + auto it = state2TaskName2Data.find(state); + if (it == state2TaskName2Data.end()) { + continue; + } + const auto& taskName2Data = it->second; + aggregateTaskName2Data(taskName2Data); + } + } + + return result; +} + +//////////////////////////////////////////////////////////////////// + +namespace { + +constexpr int USER_STATISTICS_FILE_DESCRIPTOR = 5; +constexpr char PATH_DELIMITER = '/'; +constexpr char 
ESCAPE = '\\'; + +IOutputStream* GetStatisticsStream() +{ + static TFile file = Duplicate(USER_STATISTICS_FILE_DESCRIPTOR); + static TFileOutput stream(file); + return &stream; +} + +template <typename T> +void WriteCustomStatisticsAny(TStringBuf path, const T& value) +{ + ::NYson::TYsonWriter writer(GetStatisticsStream(), NYson::EYsonFormat::Binary, ::NYson::EYsonType::ListFragment); + int depth = 0; + size_t begin = 0; + size_t end = 0; + TVector<TString> items; + while (end <= path.size()) { + if (end + 1 < path.size() && path[end] == ESCAPE && path[end + 1] == PATH_DELIMITER) { + end += 2; + continue; + } + if (end == path.size() || path[end] == PATH_DELIMITER) { + writer.OnBeginMap(); + items.emplace_back(path.data() + begin, end - begin); + SubstGlobal(items.back(), "\\/", "/"); + writer.OnKeyedItem(TStringBuf(items.back())); + ++depth; + begin = end + 1; + } + ++end; + } + Serialize(value, &writer); + while (depth > 0) { + writer.OnEndMap(); + --depth; + } +} + +} + +//////////////////////////////////////////////////////////////////// + +void WriteCustomStatistics(const TNode& statistics) +{ + ::NYson::TYsonWriter writer(GetStatisticsStream(), NYson::EYsonFormat::Binary, ::NYson::EYsonType::ListFragment); + Serialize(statistics, &writer); +} + +void WriteCustomStatistics(TStringBuf path, i64 value) +{ + WriteCustomStatisticsAny(path, value); +} + +void FlushCustomStatisticsStream() { + GetStatisticsStream()->Flush(); +} +//////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/job_statistics.h b/yt/cpp/mapreduce/interface/job_statistics.h new file mode 100644 index 0000000000..8af751604f --- /dev/null +++ b/yt/cpp/mapreduce/interface/job_statistics.h @@ -0,0 +1,268 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/job_statistics.h +/// +/// Header containing classes and utility functions to work with +/// [job 
statistics](https://docs.yandex-team.ru/yt/problems/jobstatistics). + +#include "fwd.h" + +#include <library/cpp/yson/node/node.h> + +#include <util/system/defaults.h> +#include <util/generic/maybe.h> +#include <util/generic/ptr.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////// + +/// +/// @brief Convert i64 representation of statistics to other type. +/// +/// Library defines this template for types TDuration and i64. +/// Users may define it for their types. +/// +/// @see @ref NYT::TJobStatistics::GetStatisticsAs method. +template <typename T> +T ConvertJobStatisticsEntry(i64 value); + +//////////////////////////////////////////////////////////////////// + +/// Class representing a collection of job statistics. +class TJobStatistics +{ +public: + /// + /// Construct empty statistics. + TJobStatistics(); + + /// + /// Construct statistics from statistics node. + TJobStatistics(const NYT::TNode& statistics); + + TJobStatistics(const TJobStatistics& jobStatistics); + TJobStatistics(TJobStatistics&& jobStatistics); + + TJobStatistics& operator=(const TJobStatistics& jobStatistics); + TJobStatistics& operator=(TJobStatistics&& jobStatistics); + + ~TJobStatistics(); + + /// + /// @brief Filter statistics by task name. + /// + /// @param taskNames What task names to include (empty means all). + TJobStatistics TaskName(TVector<TTaskName> taskNames) const; + + /// + /// @brief Filter statistics by job state. + /// + /// @param filter What job states to include (empty means all). + /// + /// @note Default statistics include only (successfully) completed jobs. + TJobStatistics JobState(TVector<EJobState> filter) const; + + /// + /// @brief Filter statistics by job type. + /// + /// @param filter What job types to include (empty means all). + /// + /// @deprecated Use @ref TJobStatistics::TaskName instead. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/mr/jobs#obshaya-shema + TJobStatistics JobType(TVector<EJobType> filter) const; + + /// + /// @brief Check that given statistics exist. + /// + /// @param name Slash separated statistics name, e.g. "time/total" (like it appears in web interface). + bool HasStatistics(TStringBuf name) const; + + /// + /// @brief Get statistics by name. + /// + /// @param name Slash separated statistics name, e.g. "time/total" (like it appears in web interface). + /// + /// @note If statistics is missing an exception is thrown. If because of filters + /// no fields remain the returned value is empty (all fields are `Nothing`). + /// + /// @note We don't use `TMaybe<TJobStatisticsEntry>` here; + /// instead, @ref NYT::TJobStatisticsEntry methods return `TMaybe<i64>`, + /// so user easier use `.GetOrElse`: + /// ``` + /// jobStatistics.GetStatistics("some/statistics/name").Max().GetOrElse(0); + /// ``` + TJobStatisticsEntry<i64> GetStatistics(TStringBuf name) const; + + /// + /// @brief Get statistics by name. + /// + /// @param name Slash separated statistics name, e.g. "time/total" (like it appears in web interface). + /// + /// @note In order to use `GetStatisticsAs` method, @ref NYT::ConvertJobStatisticsEntry function must be defined + /// (the library defines it for `i64` and `TDuration`, user may define it for other types). + template <typename T> + TJobStatisticsEntry<T> GetStatisticsAs(TStringBuf name) const; + + /// + /// Get (slash separated) names of statistics. + TVector<TString> GetStatisticsNames() const; + + /// + /// @brief Check if given custom statistics exists. + /// + /// @param name Slash separated custom statistics name. + bool HasCustomStatistics(TStringBuf name) const; + + /// + /// @brief Get custom statistics (those the user can write in job with @ref NYT::WriteCustomStatistics). + /// + /// @param name Slash separated custom statistics name. 
+ TJobStatisticsEntry<i64> GetCustomStatistics(TStringBuf name) const; + + /// + /// @brief Get custom statistics (those the user can write in job with @ref NYT::WriteCustomStatistics). + /// + /// @param name Slash separated custom statistics name. + template <typename T> + TJobStatisticsEntry<T> GetCustomStatisticsAs(TStringBuf name) const; + + /// + /// Get names of all custom statistics. + TVector<TString> GetCustomStatisticsNames() const; + +private: + class TData; + struct TFilter; + + struct TDataEntry { + i64 Max; + i64 Min; + i64 Sum; + i64 Count; + }; + + static const TString CustomStatisticsNamePrefix_; + +private: + TJobStatistics(::TIntrusivePtr<TData> data, ::TIntrusivePtr<TFilter> filter); + + TMaybe<TDataEntry> GetStatisticsImpl(TStringBuf name) const; + +private: + ::TIntrusivePtr<TData> Data_; + ::TIntrusivePtr<TFilter> Filter_; + +private: + template<typename T> + friend class TJobStatisticsEntry; +}; + +//////////////////////////////////////////////////////////////////// + +/// Class representing single statistic. +template <typename T> +class TJobStatisticsEntry +{ +public: + TJobStatisticsEntry(TMaybe<TJobStatistics::TDataEntry> data) + : Data_(std::move(data)) + { } + + /// Sum of the statistic over all jobs. + TMaybe<T> Sum() const + { + if (Data_) { + return ConvertJobStatisticsEntry<T>(Data_->Sum); + } + return Nothing(); + } + + /// @brief Average of the statistic over all jobs. + /// + /// @note Only jobs that emitted statistics are taken into account. + TMaybe<T> Avg() const + { + if (Data_ && Data_->Count) { + return ConvertJobStatisticsEntry<T>(Data_->Sum / Data_->Count); + } + return Nothing(); + } + + /// @brief Number of jobs that emitted this statistic. + TMaybe<T> Count() const + { + if (Data_) { + return ConvertJobStatisticsEntry<T>(Data_->Count); + } + return Nothing(); + } + + /// @brief Maximum value of the statistic over all jobs. 
+ TMaybe<T> Max() const + { + if (Data_) { + return ConvertJobStatisticsEntry<T>(Data_->Max); + } + return Nothing(); + } + + /// @brief Minimum value of the statistic over all jobs. + TMaybe<T> Min() const + { + if (Data_) { + return ConvertJobStatisticsEntry<T>(Data_->Min); + } + return Nothing(); + } + +private: + TMaybe<TJobStatistics::TDataEntry> Data_; + +private: + friend class TJobStatistics; +}; + +//////////////////////////////////////////////////////////////////// + +template <typename T> +TJobStatisticsEntry<T> TJobStatistics::GetStatisticsAs(TStringBuf name) const +{ + return TJobStatisticsEntry<T>(GetStatisticsImpl(name)); +} + +template <typename T> +TJobStatisticsEntry<T> TJobStatistics::GetCustomStatisticsAs(TStringBuf name) const +{ + return TJobStatisticsEntry<T>(GetStatisticsImpl(CustomStatisticsNamePrefix_ + name)); +} + +//////////////////////////////////////////////////////////////////// + +/// +/// @brief Write [custom statistics](https://yt.yandex-team.ru/docs/description/mr/jobs#user_stats). +/// +/// @param path Slash-separated path (length must not exceed 512 bytes). +/// @param value Value of the statistic. +/// +/// @note The function must be called in job. +/// Total number of statistics (with different paths) must not exceed 128. +void WriteCustomStatistics(TStringBuf path, i64 value); + +/// +/// @brief Write several [custom statistics](https://yt.yandex-team.ru/docs/description/mr/jobs#user_stats) at once. +/// +/// @param statistics A tree of map nodes with leaves of type `i64`. +/// +/// @note The call is equivalent to calling @ref NYT::WriteCustomStatistics(TStringBuf, i64) for every path in the given map. 
+void WriteCustomStatistics(const TNode& statistics); + +/// +/// @brief Flush [custom statistics stream](https://yt.yandex-team.ru/docs/description/mr/jobs#user_stats) +/// +void FlushCustomStatisticsStream(); +//////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/job_statistics_ut.cpp b/yt/cpp/mapreduce/interface/job_statistics_ut.cpp new file mode 100644 index 0000000000..0cf53d771a --- /dev/null +++ b/yt/cpp/mapreduce/interface/job_statistics_ut.cpp @@ -0,0 +1,257 @@ +#include <yt/cpp/mapreduce/interface/job_statistics.h> +#include <yt/cpp/mapreduce/interface/operation.h> + +#include <library/cpp/yson/node/node_io.h> + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NYT; + +Y_UNIT_TEST_SUITE(JobStatistics) +{ + Y_UNIT_TEST(Simple) + { + const TString input = R"""( + { + "data" = { + "output" = { + "0" = { + "uncompressed_data_size" = { + "$" = { + "completed" = { + "simple_sort" = { + "max" = 130; + "count" = 1; + "min" = 130; + "sum" = 130; + }; + "map" = { + "max" = 42; + "count" = 1; + "min" = 42; + "sum" = 42; + }; + }; + "aborted" = { + "simple_sort" = { + "max" = 24; + "count" = 1; + "min" = 24; + "sum" = 24; + }; + }; + }; + }; + }; + }; + }; + })"""; + + TJobStatistics stat(NodeFromYsonString(input)); + + UNIT_ASSERT(stat.HasStatistics("data/output/0/uncompressed_data_size")); + UNIT_ASSERT(!stat.HasStatistics("nonexistent-statistics")); + UNIT_ASSERT_EXCEPTION_CONTAINS(stat.GetStatistics("BLAH-BLAH"), yexception, "Statistics"); + + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatisticsNames(), TVector<TString>{"data/output/0/uncompressed_data_size"}); + + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Max(), 130); + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Count(), 2); + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Min(), 42); + 
UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Sum(), 172); + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Avg(), 172 / 2); + + UNIT_ASSERT_VALUES_EQUAL(stat.JobState({EJobState::Aborted}).GetStatistics("data/output/0/uncompressed_data_size").Sum(), 24); + UNIT_ASSERT_VALUES_EQUAL(stat.JobType({EJobType::Map}).JobState({EJobState::Aborted}).GetStatistics("data/output/0/uncompressed_data_size").Sum(), TMaybe<i64>()); + } + + Y_UNIT_TEST(TestOtherTypes) + { + const TString input = R"""( + { + "time" = { + "exec" = { + "$" = { + "completed" = { + "map" = { + "max" = 2482468; + "count" = 38; + "min" = 578976; + "sum" = 47987270; + }; + }; + }; + }; + }; + })"""; + + TJobStatistics stat(NodeFromYsonString(input)); + + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatisticsAs<TDuration>("time/exec").Max(), TDuration::MilliSeconds(2482468)); + } + + Y_UNIT_TEST(Custom) + { + const TString input = R"""( + { + "custom" = { + "some" = { + "path" = { + "$" = { + "completed" = { + "map" = { + "max" = -1; + "count" = 1; + "min" = -1; + "sum" = -1; + }; + }; + }; + }; + }; + "another" = { + "path" = { + "$" = { + "completed" = { + "map" = { + "max" = 1001; + "count" = 2; + "min" = 1001; + "sum" = 2002; + }; + }; + }; + }; + }; + }; + })"""; + + TJobStatistics stat(NodeFromYsonString(input)); + + UNIT_ASSERT(stat.HasCustomStatistics("some/path")); + UNIT_ASSERT(!stat.HasCustomStatistics("nonexistent-statistics")); + UNIT_ASSERT_EXCEPTION_CONTAINS(stat.GetCustomStatistics("BLAH-BLAH"), yexception, "Statistics"); + + const auto names = stat.GetCustomStatisticsNames(); + const THashSet<TString> expected = {"some/path", "another/path"}; + UNIT_ASSERT_VALUES_EQUAL(THashSet<TString>(names.begin(), names.end()), expected); + + UNIT_ASSERT_VALUES_EQUAL(stat.GetCustomStatistics("some/path").Max(), -1); + UNIT_ASSERT_VALUES_EQUAL(stat.GetCustomStatistics("another/path").Avg(), 1001); + } + + Y_UNIT_TEST(TaskNames) + { + const 
TString input = R"""( + { + "data" = { + "output" = { + "0" = { + "uncompressed_data_size" = { + "$" = { + "completed" = { + "partition_map" = { + "max" = 130; + "count" = 1; + "min" = 130; + "sum" = 130; + }; + "partition(0)" = { + "max" = 42; + "count" = 1; + "min" = 42; + "sum" = 42; + }; + }; + "aborted" = { + "simple_sort" = { + "max" = 24; + "count" = 1; + "min" = 24; + "sum" = 24; + }; + }; + }; + }; + }; + }; + }; + })"""; + + TJobStatistics stat(NodeFromYsonString(input)); + + UNIT_ASSERT(stat.HasStatistics("data/output/0/uncompressed_data_size")); + UNIT_ASSERT(!stat.HasStatistics("nonexistent-statistics")); + UNIT_ASSERT_EXCEPTION_CONTAINS(stat.GetStatistics("BLAH-BLAH"), yexception, "Statistics"); + + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatisticsNames(), TVector<TString>{"data/output/0/uncompressed_data_size"}); + + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Max(), 130); + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Count(), 2); + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Min(), 42); + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Sum(), 172); + UNIT_ASSERT_VALUES_EQUAL(stat.GetStatistics("data/output/0/uncompressed_data_size").Avg(), 172 / 2); + + UNIT_ASSERT_VALUES_EQUAL( + stat + .JobState({EJobState::Aborted}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + 24); + UNIT_ASSERT_VALUES_EQUAL( + stat + .JobType({EJobType::Partition}) + .JobState({EJobState::Aborted}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + TMaybe<i64>()); + UNIT_ASSERT_VALUES_EQUAL( + stat + .TaskName({"partition(0)"}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + 42); + UNIT_ASSERT_VALUES_EQUAL( + stat + .TaskName({"partition"}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + TMaybe<i64>()); + UNIT_ASSERT_VALUES_EQUAL( + stat + 
.TaskName({"partition_map(0)"}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + 130); + UNIT_ASSERT_VALUES_EQUAL( + stat + .JobType({EJobType::Partition}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + 42); + UNIT_ASSERT_VALUES_EQUAL( + stat + .JobType({EJobType::PartitionMap}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + 130); + UNIT_ASSERT_VALUES_EQUAL( + stat + .TaskName({ETaskName::Partition0}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + 42); + UNIT_ASSERT_VALUES_EQUAL( + stat + .TaskName({ETaskName::Partition1}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + TMaybe<i64>()); + UNIT_ASSERT_VALUES_EQUAL( + stat + .TaskName({ETaskName::PartitionMap0}) + .GetStatistics("data/output/0/uncompressed_data_size") + .Sum(), + 130); + } +} diff --git a/yt/cpp/mapreduce/interface/logging/logger.cpp b/yt/cpp/mapreduce/interface/logging/logger.cpp new file mode 100644 index 0000000000..bfa56b94f6 --- /dev/null +++ b/yt/cpp/mapreduce/interface/logging/logger.cpp @@ -0,0 +1,188 @@ +#include "logger.h" + +#include <util/datetime/base.h> + +#include <util/stream/file.h> +#include <util/stream/format.h> +#include <util/stream/printf.h> +#include <util/stream/str.h> + +#include <util/system/mutex.h> +#include <util/system/rwlock.h> +#include <util/system/thread.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +static TStringBuf StripFileName(TStringBuf path) { + TStringBuf l, r; + if (path.TryRSplit('/', l, r) || path.TryRSplit('\\', l, r)) { + return r; + } else { + return path; + } +} + +static char GetLogLevelCode(ILogger::ELevel level) { + switch (level) { + case ILogger::FATAL: return 'F'; + case ILogger::ERROR: return 'E'; + case ILogger::INFO: return 'I'; + case ILogger::DEBUG: return 'D'; + } + Y_UNREACHABLE(); +} + +//////////////////////////////////////////////////////////////////////////////// + 
+class TNullLogger + : public ILogger +{ +public: + void Log(ELevel level, const TSourceLocation& sourceLocation, const char* format, va_list args) override + { + Y_UNUSED(level); + Y_UNUSED(sourceLocation); + Y_UNUSED(format); + Y_UNUSED(args); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TLoggerBase + : public ILogger +{ +public: + TLoggerBase(ELevel cutLevel) + : CutLevel_(cutLevel) + { } + + virtual void OutputLine(const TString& line) = 0; + + void Log(ELevel level, const TSourceLocation& sourceLocation, const char* format, va_list args) override + { + if (level > CutLevel_) { + return; + } + + TStringStream stream; + stream << TInstant::Now().ToStringLocal() + << " " << GetLogLevelCode(level) + << " [" << Hex(TThread::CurrentThreadId(), HF_FULL) << "] "; + Printf(stream, format, args); + stream << " - " << StripFileName(sourceLocation.File) << ':' << sourceLocation.Line << Endl; + + TGuard<TMutex> guard(Mutex_); + OutputLine(stream.Str()); + } + +private: + ELevel CutLevel_; + TMutex Mutex_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TStdErrLogger + : public TLoggerBase +{ +public: + TStdErrLogger(ELevel cutLevel) + : TLoggerBase(cutLevel) + { } + + void OutputLine(const TString& line) override + { + Cerr << line; + } +}; + +ILoggerPtr CreateStdErrLogger(ILogger::ELevel cutLevel) +{ + return new TStdErrLogger(cutLevel); +} + +//////////////////////////////////////////////////////////////////////////////// + +class TFileLogger + : public TLoggerBase +{ +public: + TFileLogger(ELevel cutLevel, const TString& path, bool append) + : TLoggerBase(cutLevel) + , Stream_(TFile(path, OpenAlways | WrOnly | Seq | (append ? 
ForAppend : EOpenMode()))) + { } + + void OutputLine(const TString& line) override + { + Stream_ << line; + } + +private: + TUnbufferedFileOutput Stream_; +}; + +ILoggerPtr CreateFileLogger(ILogger::ELevel cutLevel, const TString& path, bool append) +{ + return new TFileLogger(cutLevel, path, append); +} +//////////////////////////////////////////////////////////////////////////////// + +class TBufferedFileLogger + : public TLoggerBase +{ +public: + TBufferedFileLogger(ELevel cutLevel, const TString& path, bool append) + : TLoggerBase(cutLevel) + , Stream_(TFile(path, OpenAlways | WrOnly | Seq | (append ? ForAppend : EOpenMode()))) + { } + + void OutputLine(const TString& line) override + { + Stream_ << line; + } + +private: + TFileOutput Stream_; +}; + +ILoggerPtr CreateBufferedFileLogger(ILogger::ELevel cutLevel, const TString& path, bool append) +{ + return new TBufferedFileLogger(cutLevel, path, append); +} + +//////////////////////////////////////////////////////////////////////////////// + +static TRWMutex LoggerMutex; +static ILoggerPtr Logger; + +struct TLoggerInitializer +{ + TLoggerInitializer() + { + Logger = new TNullLogger; + } +} LoggerInitializer; + +void SetLogger(ILoggerPtr logger) +{ + auto guard = TWriteGuard(LoggerMutex); + if (logger) { + Logger = logger; + } else { + Logger = new TNullLogger; + } +} + +ILoggerPtr GetLogger() +{ + auto guard = TReadGuard(LoggerMutex); + return Logger; +} + +//////////////////////////////////////////////////////////////////////////////// + +} + diff --git a/yt/cpp/mapreduce/interface/logging/logger.h b/yt/cpp/mapreduce/interface/logging/logger.h new file mode 100644 index 0000000000..2b5aae87d1 --- /dev/null +++ b/yt/cpp/mapreduce/interface/logging/logger.h @@ -0,0 +1,43 @@ +#pragma once + +#include <util/generic/ptr.h> +#include <util/generic/string.h> +#include <util/system/compat.h> +#include <util/system/src_location.h> + +namespace NYT { + 
+//////////////////////////////////////////////////////////////////////////////// + +class ILogger + : public TThrRefBase +{ +public: + enum ELevel + { + FATAL /* "fatal", "FATAL" */, + // We don't have such level as `warning', but we support it for compatibility with other APIs. + ERROR /* "error", "warning", "ERROR", "WARNING" */, + INFO /* "info", "INFO" */, + DEBUG /* "debug", "DEBUG" */ + }; + + virtual void Log(ELevel level, const ::TSourceLocation& sourceLocation, const char* format, va_list args) = 0; +}; + +using ILoggerPtr = ::TIntrusivePtr<ILogger>; + +void SetLogger(ILoggerPtr logger); +ILoggerPtr GetLogger(); + +ILoggerPtr CreateStdErrLogger(ILogger::ELevel cutLevel); +ILoggerPtr CreateFileLogger(ILogger::ELevel cutLevel, const TString& path, bool append = false); + +/** + * Create logger that writes to a file in a buffered manner. + * It should result in fewer system calls (useful if you expect a lot of log messages), + * but in case of a crash, you would lose some log messages that haven't been flushed yet. 
+ */ +ILoggerPtr CreateBufferedFileLogger(ILogger::ELevel cutLevel, const TString& path, bool append = false); + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/logging/ya.make b/yt/cpp/mapreduce/interface/logging/ya.make new file mode 100644 index 0000000000..8095bfe4ba --- /dev/null +++ b/yt/cpp/mapreduce/interface/logging/ya.make @@ -0,0 +1,16 @@ +LIBRARY() + +INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) + +SRCS( + logger.cpp + yt_log.cpp +) + +PEERDIR( + library/cpp/yt/logging +) + +GENERATE_ENUM_SERIALIZATION(logger.h) + +END() diff --git a/yt/cpp/mapreduce/interface/logging/yt_log.cpp b/yt/cpp/mapreduce/interface/logging/yt_log.cpp new file mode 100644 index 0000000000..9fa7b91580 --- /dev/null +++ b/yt/cpp/mapreduce/interface/logging/yt_log.cpp @@ -0,0 +1,126 @@ +#include "yt_log.h" + +#include "logger.h" + +#include <util/generic/guid.h> + +#include <util/system/mutex.h> + +namespace NYT { + +using namespace NLogging; + +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +class TLogManager + : public ILogManager +{ +public: + static constexpr TStringBuf CategoryName = "Wrapper"; + +public: + void RegisterStaticAnchor( + TLoggingAnchor* anchor, + ::TSourceLocation sourceLocation, + TStringBuf anchorMessage) override + { + if (anchor->Registered.exchange(true)) { + return; + } + + anchor->Enabled.store(true); + + auto guard = Guard(Mutex_); + anchor->SourceLocation = sourceLocation; + anchor->AnchorMessage = anchorMessage; + } + + void UpdateAnchor(TLoggingAnchor* /*position*/) override + { } + + void Enqueue(TLogEvent&& event) override + { + auto message = TString(event.MessageRef.ToStringBuf()); + LogMessage( + ToImplLevel(event.Level), + ::TSourceLocation(event.SourceFile, event.SourceLine), + "%.*s", + event.MessageRef.size(), + event.MessageRef.begin()); + } + + const TLoggingCategory* GetCategory(TStringBuf categoryName) override + { + Y_VERIFY(categoryName == CategoryName); + return &Category_; 
+ } + + void UpdateCategory(TLoggingCategory* /*category*/) override + { + Y_FAIL(); + } + + bool GetAbortOnAlert() const override + { + return false; + } + +private: + static ILogger::ELevel ToImplLevel(ELogLevel level) + { + switch (level) { + case ELogLevel::Minimum: + case ELogLevel::Trace: + case ELogLevel::Debug: + return ILogger::ELevel::DEBUG; + case ELogLevel::Info: + return ILogger::ELevel::INFO; + case ELogLevel::Warning: + case ELogLevel::Error: + return ILogger::ELevel::ERROR; + case ELogLevel::Alert: + case ELogLevel::Fatal: + case ELogLevel::Maximum: + return ILogger::ELevel::FATAL; + } + } + + static void LogMessage(ILogger::ELevel level, const ::TSourceLocation& sourceLocation, const char* format, ...) + { + va_list args; + va_start(args, format); + GetLogger()->Log(level, sourceLocation, format, args); + va_end(args); + } + +private: + ::TMutex Mutex_; + std::atomic<int> ActualVersion_{1}; + const TLoggingCategory Category_{ + .Name{CategoryName}, + .MinPlainTextLevel{ELogLevel::Minimum}, + .CurrentVersion{1}, + .ActualVersion = &ActualVersion_, + }; +}; + +TLogManager LogManager; + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +TLogger Logger(&LogManager, TLogManager::CategoryName); + +//////////////////////////////////////////////////////////////////////////////// + +void FormatValue(TStringBuilderBase* builder, const TGUID& value, TStringBuf /*format*/) +{ + builder->AppendString(GetGuidAsString(value)); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/logging/yt_log.h b/yt/cpp/mapreduce/interface/logging/yt_log.h new file mode 100644 index 0000000000..4cf93a6ba1 --- /dev/null +++ b/yt/cpp/mapreduce/interface/logging/yt_log.h @@ -0,0 +1,17 @@ +#pragma once + +#include <library/cpp/yt/logging/logger.h> + +struct TGUID; + +namespace NYT { + 
+//////////////////////////////////////////////////////////////////////////////// + +extern NLogging::TLogger Logger; + +void FormatValue(TStringBuilderBase* builder, const TGUID& value, TStringBuf format); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/mpl.h b/yt/cpp/mapreduce/interface/mpl.h new file mode 100644 index 0000000000..9865e28b6c --- /dev/null +++ b/yt/cpp/mapreduce/interface/mpl.h @@ -0,0 +1,73 @@ +#pragma once + +#include "fwd.h" + +#include <tuple> +#include <type_traits> + +namespace NYT { + +/// @cond Doxygen_Suppress + +//////////////////////////////////////////////////////////////////////////////// + +template <class TBase, class TDerived> +struct TIsBaseOf +{ + static constexpr bool Value = std::is_base_of_v<TBase, TDerived> && !std::is_same_v<TBase, TDerived>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +template <class T, class Tuple> +struct TIndexInTuple; + +template <class T, class... Types> +struct TIndexInTuple<T, std::tuple<T, Types...>> +{ + static constexpr int Value = 0; +}; + +template <class T> +struct TIndexInTuple<T, std::tuple<>> +{ + static constexpr int Value = 0; +}; + +template <class T, class U, class... Types> +struct TIndexInTuple<T, std::tuple<U, Types...>> +{ + static constexpr int Value = 1 + TIndexInTuple<T, std::tuple<Types...>>::Value; +}; + +template <class T, class TTuple> +constexpr bool DoesTupleContainType = (TIndexInTuple<T, TTuple>::Value < std::tuple_size<TTuple>{}); + +template <class TOut, class TIn = std::tuple<>> +struct TUniqueTypes; + +template <class... TOut, class TInCar, class... 
TInCdr> +struct TUniqueTypes<std::tuple<TOut...>, std::tuple<TInCar, TInCdr...>> +{ + using TType = std::conditional_t< + DoesTupleContainType<TInCar, std::tuple<TOut...>>, + typename TUniqueTypes<std::tuple<TOut...>, std::tuple<TInCdr...>>::TType, + typename TUniqueTypes<std::tuple<TOut..., TInCar>, std::tuple<TInCdr...>>::TType + >; +}; + +template <class TOut> +struct TUniqueTypes<TOut, std::tuple<>> +{ + using TType = TOut; +}; + +} // namespace NDetail + +/// @endcond Doxygen_Suppress + +//////////////////////////////////////////////////////////////////////////////// + +} diff --git a/yt/cpp/mapreduce/interface/node.h b/yt/cpp/mapreduce/interface/node.h new file mode 100644 index 0000000000..fece1b36de --- /dev/null +++ b/yt/cpp/mapreduce/interface/node.h @@ -0,0 +1,7 @@ +#pragma once + +// Backward compatibility +#include "fwd.h" +#include <library/cpp/yson/node/node.h> + + diff --git a/yt/cpp/mapreduce/interface/operation-inl.h b/yt/cpp/mapreduce/interface/operation-inl.h new file mode 100644 index 0000000000..8d53cd446f --- /dev/null +++ b/yt/cpp/mapreduce/interface/operation-inl.h @@ -0,0 +1,928 @@ +#pragma once + +#ifndef OPERATION_INL_H_ +#error "Direct inclusion of this file is not allowed, use operation.h" +#include "operation.h" +#endif +#undef OPERATION_INL_H_ + +#include "errors.h" + +#include <util/generic/bt_exception.h> +#include <util/generic/singleton.h> +#include <util/system/type_name.h> + +#include <util/stream/file.h> +#include <util/stream/buffer.h> +#include <util/string/subst.h> + +#include <typeindex> + +namespace NYT { + +namespace NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +template<class T> +void Assign(TVector<T>& array, size_t idx, const T& value) { + array.resize(std::max(array.size(), idx + 1)); + array[idx] = value; +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TRow> +TStructuredRowStreamDescription 
GetStructuredRowStreamDescription() +{ + if constexpr (std::is_same_v<TRow, NYT::TNode>) { + return TTNodeStructuredRowStream{}; + } else if constexpr (std::is_same_v<TRow, NYT::TYaMRRow>) { + return TTYaMRRowStructuredRowStream{}; + } else if constexpr (std::is_same_v<::google::protobuf::Message, TRow>) { + return TProtobufStructuredRowStream{nullptr}; + } else if constexpr (TIsBaseOf<::google::protobuf::Message, TRow>::Value) { + return TProtobufStructuredRowStream{TRow::descriptor()}; + } else if constexpr (TIsProtoOneOf<TRow>::value) { + return TProtobufStructuredRowStream{nullptr}; + } else { + static_assert(TDependentFalse<TRow>, "Unknown row type"); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NDetail + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TRow> +TStructuredTablePath Structured(TRichYPath richYPath) +{ + return TStructuredTablePath(std::move(richYPath), StructuredTableDescription<TRow>()); +} + +template <typename TRow> +TTableStructure StructuredTableDescription() +{ + if constexpr (std::is_same_v<TRow, NYT::TNode>) { + return TUnspecifiedTableStructure{}; + } else if constexpr (std::is_same_v<TRow, NYT::TYaMRRow>) { + return TUnspecifiedTableStructure{}; + } else if constexpr (std::is_base_of_v<::google::protobuf::Message, TRow>) { + if constexpr (std::is_same_v<::google::protobuf::Message, TRow>) { + static_assert(TDependentFalse<TRow>, "Cannot use ::google::protobuf::Message as table descriptor"); + } else { + return TProtobufTableStructure{TRow::descriptor()}; + } + } else { + static_assert(TDependentFalse<TRow>, "Unknown row type"); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TDerived> +TDerived& TRawOperationIoTableSpec<TDerived>::AddInput(const TRichYPath& path) +{ + Inputs_.push_back(path); + return static_cast<TDerived&>(*this); +} + +template 
<typename TDerived> +TDerived& TRawOperationIoTableSpec<TDerived>::SetInput(size_t tableIndex, const TRichYPath& path) +{ + NDetail::Assign(Inputs_, tableIndex, path); +} + +template <typename TDerived> +TDerived& TRawOperationIoTableSpec<TDerived>::AddOutput(const TRichYPath& path) +{ + Outputs_.push_back(path); + return static_cast<TDerived&>(*this); +} + +template <typename TDerived> +TDerived& TRawOperationIoTableSpec<TDerived>::SetOutput(size_t tableIndex, const TRichYPath& path) +{ + NDetail::Assign(Outputs_, tableIndex, path); +} + +template <typename TDerived> +const TVector<TRichYPath>& TRawOperationIoTableSpec<TDerived>::GetInputs() const +{ + return Inputs_; +} + +template <typename TDerived> +const TVector<TRichYPath>& TRawOperationIoTableSpec<TDerived>::GetOutputs() const +{ + return Outputs_; +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TDerived> +TDerived& TRawMapReduceOperationIoSpec<TDerived>::AddMapOutput(const TRichYPath& path) +{ + MapOutputs_.push_back(path); + return static_cast<TDerived&>(*this); +} + +template <typename TDerived> +TDerived& TRawMapReduceOperationIoSpec<TDerived>::SetMapOutput(size_t tableIndex, const TRichYPath& path) +{ + NDetail::Assign(MapOutputs_, tableIndex, path); +} + +template <typename TDerived> +const TVector<TRichYPath>& TRawMapReduceOperationIoSpec<TDerived>::GetMapOutputs() const +{ + return MapOutputs_; +} + +//////////////////////////////////////////////////////////////////////////////// + +::TIntrusivePtr<INodeReaderImpl> CreateJobNodeReader(TRawTableReaderPtr rawTableReader); +::TIntrusivePtr<IYaMRReaderImpl> CreateJobYaMRReader(TRawTableReaderPtr rawTableReader); +::TIntrusivePtr<IProtoReaderImpl> CreateJobProtoReader(TRawTableReaderPtr rawTableReader); + +::TIntrusivePtr<INodeWriterImpl> CreateJobNodeWriter(THolder<IProxyOutput> rawTableWriter); +::TIntrusivePtr<IYaMRWriterImpl> CreateJobYaMRWriter(THolder<IProxyOutput> rawTableWriter); 
+::TIntrusivePtr<IProtoWriterImpl> CreateJobProtoWriter(THolder<IProxyOutput> rawTableWriter); + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +inline ::TIntrusivePtr<typename TRowTraits<T>::IReaderImpl> CreateJobReaderImpl(TRawTableReaderPtr rawTableReader); + +template <> +inline ::TIntrusivePtr<INodeReaderImpl> CreateJobReaderImpl<TNode>(TRawTableReaderPtr rawTableReader) +{ + return CreateJobNodeReader(rawTableReader); +} + +template <> +inline ::TIntrusivePtr<IYaMRReaderImpl> CreateJobReaderImpl<TYaMRRow>(TRawTableReaderPtr rawTableReader) +{ + return CreateJobYaMRReader(rawTableReader); +} + +template <> +inline ::TIntrusivePtr<IProtoReaderImpl> CreateJobReaderImpl<Message>(TRawTableReaderPtr rawTableReader) +{ + return CreateJobProtoReader(rawTableReader); +} + +template <class T> +inline ::TIntrusivePtr<typename TRowTraits<T>::IReaderImpl> CreateJobReaderImpl(TRawTableReaderPtr rawTableReader) +{ + if constexpr (TIsBaseOf<Message, T>::Value || NDetail::TIsProtoOneOf<T>::value) { + return CreateJobProtoReader(rawTableReader); + } else { + static_assert(TDependentFalse<T>, "Unknown row type"); + } +} + +template <class T> +inline TTableReaderPtr<T> CreateJobReader(TRawTableReaderPtr rawTableReader) +{ + return new TTableReader<T>(CreateJobReaderImpl<T>(rawTableReader)); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +TTableWriterPtr<T> CreateJobWriter(THolder<IProxyOutput> rawJobWriter); + +template <> +inline TTableWriterPtr<TNode> CreateJobWriter<TNode>(THolder<IProxyOutput> rawJobWriter) +{ + return new TTableWriter<TNode>(CreateJobNodeWriter(std::move(rawJobWriter))); +} + +template <> +inline TTableWriterPtr<TYaMRRow> CreateJobWriter<TYaMRRow>(THolder<IProxyOutput> rawJobWriter) +{ + return new TTableWriter<TYaMRRow>(CreateJobYaMRWriter(std::move(rawJobWriter))); +} + +template <> +inline TTableWriterPtr<Message> 
CreateJobWriter<Message>(THolder<IProxyOutput> rawJobWriter) +{ + return new TTableWriter<Message>(CreateJobProtoWriter(std::move(rawJobWriter))); +} + +template <class T, class = void> +struct TProtoWriterCreator; + +template <class T> +struct TProtoWriterCreator<T, std::enable_if_t<TIsBaseOf<Message, T>::Value>> +{ + static TTableWriterPtr<T> Create(::TIntrusivePtr<IProtoWriterImpl> writer) + { + return new TTableWriter<T>(writer); + } +}; + +template <class T> +inline TTableWriterPtr<T> CreateJobWriter(THolder<IProxyOutput> rawJobWriter) +{ + if constexpr (TIsBaseOf<Message, T>::Value) { + return TProtoWriterCreator<T>::Create(CreateJobProtoWriter(std::move(rawJobWriter))); + } else { + static_assert(TDependentFalse<T>, "Unknown row type"); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +void TOperationInputSpecBase::AddInput(const TRichYPath& path) +{ + Inputs_.push_back(path); + StructuredInputs_.emplace_back(Structured<T>(path)); +} + +template <class T> +void TOperationInputSpecBase::SetInput(size_t tableIndex, const TRichYPath& path) +{ + NDetail::Assign(Inputs_, tableIndex, path); + NDetail::Assign(StructuredInputs_, tableIndex, Structured<T>(path)); +} + + +template <class T> +void TOperationOutputSpecBase::AddOutput(const TRichYPath& path) +{ + Outputs_.push_back(path); + StructuredOutputs_.emplace_back(Structured<T>(path)); +} + +template <class T> +void TOperationOutputSpecBase::SetOutput(size_t tableIndex, const TRichYPath& path) +{ + NDetail::Assign(Outputs_, tableIndex, path); + NDetail::Assign(StructuredOutputs_, tableIndex, Structured<T>(path)); +} + +template <class TDerived> +template <class T> +TDerived& TOperationIOSpec<TDerived>::AddInput(const TRichYPath& path) +{ + static_assert(!std::is_same<T, Message>::value, "input type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)"); + TOperationInputSpecBase::AddInput<T>(path); + return 
*static_cast<TDerived*>(this); +} + +template <class TDerived> +template <class T> +TDerived& TOperationIOSpec<TDerived>::SetInput(size_t tableIndex, const TRichYPath& path) +{ + static_assert(!std::is_same<T, Message>::value, "input type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)"); + TOperationInputSpecBase::SetInput<T>(tableIndex, path); + return *static_cast<TDerived*>(this); +} + + +template <class TDerived> +template <class T> +TDerived& TOperationIOSpec<TDerived>::AddOutput(const TRichYPath& path) +{ + static_assert(!std::is_same<T, Message>::value, "output type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)"); + TOperationOutputSpecBase::AddOutput<T>(path); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +template <class T> +TDerived& TOperationIOSpec<TDerived>::SetOutput(size_t tableIndex, const TRichYPath& path) +{ + static_assert(!std::is_same<T, Message>::value, "output type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)"); + TOperationOutputSpecBase::SetOutput<T>(tableIndex, path); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +TDerived& TOperationIOSpec<TDerived>::AddStructuredInput(TStructuredTablePath path) +{ + TOperationInputSpecBase::AddStructuredInput(std::move(path)); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +TDerived& TOperationIOSpec<TDerived>::AddStructuredOutput(TStructuredTablePath path) +{ + TOperationOutputSpecBase::AddStructuredOutput(std::move(path)); + return *static_cast<TDerived*>(this); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +TVanillaTask& TVanillaTask::AddOutput(const TRichYPath& path) +{ + static_assert(!std::is_same<T, Message>::value, "output type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)"); + 
TOperationOutputSpecBase::AddOutput<T>(path); + return *this; +} + +template <class T> +TVanillaTask& TVanillaTask::SetOutput(size_t tableIndex, const TRichYPath& path) +{ + static_assert(!std::is_same<T, Message>::value, "output type can't be Message, it can only be its strict subtype (see st.yandex-team.ru/YT-7609)"); + TOperationOutputSpecBase::SetOutput<T>(tableIndex, path); + return *this; +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + +void ResetUseClientProtobuf(const char* methodName); + +} // namespace NDetail + +template <class TDerived> +TDerived& TOperationIOSpec<TDerived>::AddProtobufInput_VerySlow_Deprecated(const TRichYPath& path) +{ + NDetail::ResetUseClientProtobuf("AddProtobufInput_VerySlow_Deprecated"); + Inputs_.push_back(path); + StructuredInputs_.emplace_back(TStructuredTablePath(path, TProtobufTableStructure{nullptr})); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +TDerived& TOperationIOSpec<TDerived>::AddProtobufOutput_VerySlow_Deprecated(const TRichYPath& path) +{ + NDetail::ResetUseClientProtobuf("AddProtobufOutput_VerySlow_Deprecated"); + Outputs_.push_back(path); + StructuredOutputs_.emplace_back(TStructuredTablePath(path, TProtobufTableStructure{nullptr})); + return *static_cast<TDerived*>(this); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TRow> +TJobOperationPreparer::TInputGroup& TJobOperationPreparer::TInputGroup::Description() +{ + for (auto i : Indices_) { + Preparer_.InputDescription<TRow>(i); + } + return *this; +} + +template <typename TRow> +TJobOperationPreparer::TOutputGroup& TJobOperationPreparer::TOutputGroup::Description(bool inferSchema) +{ + for (auto i : Indices_) { + Preparer_.OutputDescription<TRow>(i, inferSchema); + } + return *this; +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TCont> 
+TJobOperationPreparer::TInputGroup TJobOperationPreparer::BeginInputGroup(const TCont& indices) +{ + for (auto i : indices) { + ValidateInputTableIndex(i, TStringBuf("BeginInputGroup()")); + } + return TInputGroup(*this, TVector<int>(std::begin(indices), std::end(indices))); +} + +template <typename TCont> +TJobOperationPreparer::TOutputGroup TJobOperationPreparer::BeginOutputGroup(const TCont& indices) +{ + for (auto i : indices) { + ValidateOutputTableIndex(i, TStringBuf("BeginOutputGroup()")); + } + // Materialize the indices the same way BeginInputGroup does, so any iterable container is accepted. + return TOutputGroup(*this, TVector<int>(std::begin(indices), std::end(indices))); +} + + +template <typename TRow> +TJobOperationPreparer& TJobOperationPreparer::InputDescription(int tableIndex) +{ + ValidateMissingInputDescription(tableIndex); + InputTableDescriptions_[tableIndex] = StructuredTableDescription<TRow>(); + return *this; +} + +template <typename TRow> +TJobOperationPreparer& TJobOperationPreparer::OutputDescription(int tableIndex, bool inferSchema) +{ + ValidateMissingOutputDescription(tableIndex); + OutputTableDescriptions_[tableIndex] = StructuredTableDescription<TRow>(); + if (inferSchema && !OutputSchemas_[tableIndex]) { + OutputSchemas_[tableIndex] = CreateTableSchema<TRow>(); + } + return *this; +} + +//////////////////////////////////////////////////////////////////////////////// + +template <class TDerived> +template <class TRow> +TDerived& TIntermediateTablesHintSpec<TDerived>::HintMapOutput() +{ + IntermediateMapOutputDescription_ = StructuredTableDescription<TRow>(); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +template <class TRow> +TDerived& TIntermediateTablesHintSpec<TDerived>::AddMapOutput(const TRichYPath& path) +{ + MapOutputs_.push_back(path); + StructuredMapOutputs_.emplace_back(Structured<TRow>(path)); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +template <class TRow> +TDerived& TIntermediateTablesHintSpec<TDerived>::HintReduceCombinerInput() +{ + IntermediateReduceCombinerInputDescription_ = 
StructuredTableDescription<TRow>(); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +template <class TRow> +TDerived& TIntermediateTablesHintSpec<TDerived>::HintReduceCombinerOutput() +{ + IntermediateReduceCombinerOutputDescription_ = StructuredTableDescription<TRow>(); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +template <class TRow> +TDerived& TIntermediateTablesHintSpec<TDerived>::HintReduceInput() +{ + IntermediateReducerInputDescription_ = StructuredTableDescription<TRow>(); + return *static_cast<TDerived*>(this); +} + +template <class TDerived> +const TVector<TStructuredTablePath>& TIntermediateTablesHintSpec<TDerived>::GetStructuredMapOutputs() const +{ + return StructuredMapOutputs_; +} + +template <class TDerived> +const TMaybe<TTableStructure>& TIntermediateTablesHintSpec<TDerived>::GetIntermediateMapOutputDescription() const +{ + return IntermediateMapOutputDescription_; +} + +template <class TDerived> +const TMaybe<TTableStructure>& TIntermediateTablesHintSpec<TDerived>::GetIntermediateReduceCombinerInputDescription() const +{ + return IntermediateReduceCombinerInputDescription_; +} + +template <class TDerived> +const TMaybe<TTableStructure>& TIntermediateTablesHintSpec<TDerived>::GetIntermediateReduceCombinerOutputDescription() const +{ + return IntermediateReduceCombinerOutputDescription_; +} + +template <class TDerived> +const TMaybe<TTableStructure>& TIntermediateTablesHintSpec<TDerived>::GetIntermediateReducerInputDescription() const +{ + return IntermediateReducerInputDescription_; +} + +//////////////////////////////////////////////////////////////////////////////// + +struct TReducerContext +{ + bool Break = false; + static TReducerContext* Get() { return Singleton<TReducerContext>(); } +}; + +template <class TR, class TW> +inline void IReducer<TR, TW>::Break() +{ + TReducerContext::Get()->Break = true; +} + +template <typename TReader, typename TWriter> +void FeedJobInput( + IMapper<TReader, 
TWriter>* mapper, + typename TRowTraits<typename TReader::TRowType>::IReaderImpl* readerImpl, + TWriter* writer) +{ + using TInputRow = typename TReader::TRowType; + + auto reader = MakeIntrusive<TTableReader<TInputRow>>(readerImpl); + mapper->Do(reader.Get(), writer); +} + +template <typename TReader, typename TWriter> +void FeedJobInput( + IReducer<TReader, TWriter>* reducer, + typename TRowTraits<typename TReader::TRowType>::IReaderImpl* readerImpl, + TWriter* writer) +{ + using TInputRow = typename TReader::TRowType; + + auto rangesReader = MakeIntrusive<TTableRangesReader<TInputRow>>(readerImpl); + for (; rangesReader->IsValid(); rangesReader->Next()) { + reducer->Do(&rangesReader->GetRange(), writer); + if (TReducerContext::Get()->Break) { + break; + } + } +} + +template <typename TReader, typename TWriter> +void FeedJobInput( + IAggregatorReducer<TReader, TWriter>* reducer, + typename TRowTraits<typename TReader::TRowType>::IReaderImpl* readerImpl, + TWriter* writer) +{ + using TInputRow = typename TReader::TRowType; + + auto rangesReader = MakeIntrusive<TTableRangesReader<TInputRow>>(readerImpl); + reducer->Do(rangesReader.Get(), writer); +} + +template <class TRawJob> +int RunRawJob(size_t outputTableCount, IInputStream& jobStateStream) +{ + TRawJobContext context(outputTableCount); + + TRawJob job; + job.Load(jobStateStream); + job.Do(context); + return 0; +} + +template <> +inline int RunRawJob<TCommandRawJob>(size_t /* outputTableCount */, IInputStream& /* jobStateStream */) +{ + Y_FAIL(); +} + +template <class TVanillaJob> +int RunVanillaJob(size_t outputTableCount, IInputStream& jobStateStream) +{ + TVanillaJob job; + job.Load(jobStateStream); + + if constexpr (std::is_base_of<IVanillaJob<>, TVanillaJob>::value) { + Y_VERIFY(outputTableCount == 0, "Void vanilla job expects zero 'outputTableCount'"); + job.Do(); + } else { + Y_VERIFY(outputTableCount, "Vanilla job with table writer expects nonzero 'outputTableCount'"); + using TOutputRow = typename 
TVanillaJob::TWriter::TRowType; + + THolder<IProxyOutput> rawJobWriter; + if (auto customWriter = job.CreateCustomRawJobWriter(outputTableCount)) { + rawJobWriter = std::move(customWriter); + } else { + rawJobWriter = CreateRawJobWriter(outputTableCount); + } + auto writer = CreateJobWriter<TOutputRow>(std::move(rawJobWriter)); + + job.Start(writer.Get()); + job.Do(writer.Get()); + job.Finish(writer.Get()); + + writer->Finish(); + } + return 0; +} + +template <> +inline int RunVanillaJob<TCommandVanillaJob>(size_t /* outputTableCount */, IInputStream& /* jobStateStream */) +{ + Y_FAIL(); +} + +template <class TJob> + requires TIsBaseOf<IStructuredJob, TJob>::Value +int RunJob(size_t outputTableCount, IInputStream& jobStateStream) +{ + using TInputRow = typename TJob::TReader::TRowType; + using TOutputRow = typename TJob::TWriter::TRowType; + + auto job = MakeIntrusive<TJob>(); + job->Load(jobStateStream); + + TRawTableReaderPtr rawJobReader; + if (auto customReader = job->CreateCustomRawJobReader(/*fd*/ 0)) { + rawJobReader = customReader; + } else { + rawJobReader = CreateRawJobReader(/*fd*/ 0); + } + auto readerImpl = CreateJobReaderImpl<TInputRow>(rawJobReader); + + // Many users don't expect to have jobs with empty input so we skip such jobs. + if (!readerImpl->IsValid()) { + return 0; + } + + THolder<IProxyOutput> rawJobWriter; + if (auto customWriter = job->CreateCustomRawJobWriter(outputTableCount)) { + rawJobWriter = std::move(customWriter); + } else { + rawJobWriter = CreateRawJobWriter(outputTableCount); + } + auto writer = CreateJobWriter<TOutputRow>(std::move(rawJobWriter)); + + job->Start(writer.Get()); + FeedJobInput(job.Get(), readerImpl.Get(), writer.Get()); + job->Finish(writer.Get()); + + writer->Finish(); + + return 0; +} + +// +// We leave RunMapJob/RunReduceJob/RunAggregatorReducer for backward compatibility, +// some user use them already. 
:( + +template <class TMapper> +int RunMapJob(size_t outputTableCount, IInputStream& jobStateStream) +{ + return RunJob<TMapper>(outputTableCount, jobStateStream); +} + +template <class TReducer> +int RunReduceJob(size_t outputTableCount, IInputStream& jobStateStream) +{ + return RunJob<TReducer>(outputTableCount, jobStateStream); +} + +template <class TReducer> +int RunAggregatorReducer(size_t outputTableCount, IInputStream& jobStateStream) +{ + return RunJob<TReducer>(outputTableCount, jobStateStream); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename T, typename = void> +struct TIsConstructibleFromNode + : std::false_type +{ }; + +template <typename T> +struct TIsConstructibleFromNode<T, std::void_t<decltype(T::FromNode(std::declval<TNode&>()))>> + : std::true_type +{ }; + +template <class TJob> +::TIntrusivePtr<NYT::IStructuredJob> ConstructJobFromNode(const TNode& node) +{ + if constexpr (TIsConstructibleFromNode<TJob>::value) { + Y_ENSURE(node.GetType() != TNode::Undefined, + "job has FromNode method but constructor arguments were not provided"); + return TJob::FromNode(node); + } else { + Y_ENSURE(node.GetType() == TNode::Undefined, + "constructor arguments provided but job does not contain FromNode method"); + return MakeIntrusive<TJob>(); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +using TJobFunction = int (*)(size_t, IInputStream&); +using TConstructJobFunction = ::TIntrusivePtr<NYT::IStructuredJob> (*)(const TNode&); + +class TJobFactory +{ +public: + static TJobFactory* Get() + { + return Singleton<TJobFactory>(); + } + + template <class TJob> + void RegisterJob(const char* name) + { + RegisterJobImpl<TJob>(name, RunJob<TJob>); + JobConstructors[name] = ConstructJobFromNode<TJob>; + } + + template <class TRawJob> + void RegisterRawJob(const char* name) + { + RegisterJobImpl<TRawJob>(name, RunRawJob<TRawJob>); + } + + template <class TVanillaJob> 
+ void RegisterVanillaJob(const char* name) + { + RegisterJobImpl<TVanillaJob>(name, RunVanillaJob<TVanillaJob>); + } + + TString GetJobName(const IJob* job) + { + const auto typeIndex = std::type_index(typeid(*job)); + CheckJobRegistered(typeIndex); + return JobNames[typeIndex]; + } + + TJobFunction GetJobFunction(const char* name) + { + CheckNameRegistered(name); + return JobFunctions[name]; + } + + TConstructJobFunction GetConstructingFunction(const char* name) + { + CheckNameRegistered(name); + return JobConstructors[name]; + } + +private: + TMap<std::type_index, TString> JobNames; + THashMap<TString, TJobFunction> JobFunctions; + THashMap<TString, TConstructJobFunction> JobConstructors; + + template <typename TJob, typename TRunner> + void RegisterJobImpl(const char* name, TRunner runner) { + const auto typeIndex = std::type_index(typeid(TJob)); + CheckNotRegistered(typeIndex, name); + JobNames[typeIndex] = name; + JobFunctions[name] = runner; + } + + void CheckNotRegistered(const std::type_index& typeIndex, const char* name) + { + Y_ENSURE(!JobNames.contains(typeIndex), + "type_info '" << typeIndex.name() << "'" + "is already registered under name '" << JobNames[typeIndex] << "'"); + Y_ENSURE(!JobFunctions.contains(name), + "job with name '" << name << "' is already registered"); + } + + void CheckJobRegistered(const std::type_index& typeIndex) + { + Y_ENSURE(JobNames.contains(typeIndex), + "type_info '" << typeIndex.name() << "' is not registered, use REGISTER_* macros"); + } + + void CheckNameRegistered(const char* name) + { + Y_ENSURE(JobFunctions.contains(name), + "job with name '" << name << "' is not registered, use REGISTER_* macros"); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +template <class TMapper> +struct TMapperRegistrator +{ + TMapperRegistrator(const char* name) + { + static_assert(TMapper::JobType == IJob::EType::Mapper, + "REGISTER_MAPPER is not compatible with this job class"); + + 
NYT::TJobFactory::Get()->RegisterJob<TMapper>(name); + } +}; + +template <class TReducer> +struct TReducerRegistrator +{ + TReducerRegistrator(const char* name) + { + static_assert(TReducer::JobType == IJob::EType::Reducer || + TReducer::JobType == IJob::EType::ReducerAggregator, + "REGISTER_REDUCER is not compatible with this job class"); + + NYT::TJobFactory::Get()->RegisterJob<TReducer>(name); + } +}; + +template <class TRawJob> +struct TRawJobRegistrator +{ + TRawJobRegistrator(const char* name) + { + static_assert(TRawJob::JobType == IJob::EType::RawJob, + "REGISTER_RAW_JOB is not compatible with this job class"); + NYT::TJobFactory::Get()->RegisterRawJob<TRawJob>(name); + } +}; + +template <class TVanillaJob> +struct TVanillaJobRegistrator +{ + TVanillaJobRegistrator(const char* name) + { + static_assert(TVanillaJob::JobType == IJob::EType::VanillaJob, + "REGISTER_VANILLA_JOB is not compatible with this job class"); + NYT::TJobFactory::Get()->RegisterVanillaJob<TVanillaJob>(name); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +inline TString YtRegistryTypeName(const TString& name) { + TString res = name; +#ifdef _win_ + SubstGlobal(res, "class ", ""); +#endif + return res; +} + +//////////////////////////////////////////////////////////////////////////////// + +#define REGISTER_MAPPER(...) \ +static const NYT::TMapperRegistrator<__VA_ARGS__> \ +Y_GENERATE_UNIQUE_ID(TJobRegistrator)(NYT::YtRegistryTypeName(TypeName<__VA_ARGS__>()).data()); + +#define REGISTER_NAMED_MAPPER(name, ...) \ +static const NYT::TMapperRegistrator<__VA_ARGS__> \ +Y_GENERATE_UNIQUE_ID(TJobRegistrator)(name); + +#define REGISTER_REDUCER(...) \ +static const NYT::TReducerRegistrator<__VA_ARGS__> \ +Y_GENERATE_UNIQUE_ID(TJobRegistrator)(NYT::YtRegistryTypeName(TypeName<__VA_ARGS__>()).data()); + +#define REGISTER_NAMED_REDUCER(name, ...) 
\ +static const NYT::TReducerRegistrator<__VA_ARGS__> \ +Y_GENERATE_UNIQUE_ID(TJobRegistrator)(name); + +#define REGISTER_NAMED_RAW_JOB(name, ...) \ +static const NYT::TRawJobRegistrator<__VA_ARGS__> \ +Y_GENERATE_UNIQUE_ID(TJobRegistrator)(name); + +#define REGISTER_RAW_JOB(...) \ +REGISTER_NAMED_RAW_JOB((NYT::YtRegistryTypeName(TypeName<__VA_ARGS__>()).data()), __VA_ARGS__) + +#define REGISTER_NAMED_VANILLA_JOB(name, ...) \ +static NYT::TVanillaJobRegistrator<__VA_ARGS__> \ +Y_GENERATE_UNIQUE_ID(TJobRegistrator)(name); + +#define REGISTER_VANILLA_JOB(...) \ +REGISTER_NAMED_VANILLA_JOB((NYT::YtRegistryTypeName(TypeName<__VA_ARGS__>()).data()), __VA_ARGS__) + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TReader, typename TWriter> +TStructuredRowStreamDescription IMapper<TReader, TWriter>::GetInputRowStreamDescription() const +{ + return NYT::NDetail::GetStructuredRowStreamDescription<typename TReader::TRowType>(); +} + +template <typename TReader, typename TWriter> +TStructuredRowStreamDescription IMapper<TReader, TWriter>::GetOutputRowStreamDescription() const +{ + return NYT::NDetail::GetStructuredRowStreamDescription<typename TWriter::TRowType>(); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TReader, typename TWriter> +TStructuredRowStreamDescription IReducer<TReader, TWriter>::GetInputRowStreamDescription() const +{ + return NYT::NDetail::GetStructuredRowStreamDescription<typename TReader::TRowType>(); +} + +template <typename TReader, typename TWriter> +TStructuredRowStreamDescription IReducer<TReader, TWriter>::GetOutputRowStreamDescription() const +{ + return NYT::NDetail::GetStructuredRowStreamDescription<typename TWriter::TRowType>(); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TReader, typename TWriter> +TStructuredRowStreamDescription IAggregatorReducer<TReader, 
TWriter>::GetInputRowStreamDescription() const +{ + return NYT::NDetail::GetStructuredRowStreamDescription<typename TReader::TRowType>(); +} + +template <typename TReader, typename TWriter> +TStructuredRowStreamDescription IAggregatorReducer<TReader, TWriter>::GetOutputRowStreamDescription() const +{ + return NYT::NDetail::GetStructuredRowStreamDescription<typename TWriter::TRowType>(); +} + +//////////////////////////////////////////////////////////////////////////////// + +template <typename TWriter> +TStructuredRowStreamDescription IVanillaJob<TWriter>::GetInputRowStreamDescription() const +{ + return TVoidStructuredRowStream(); +} + +template <typename TWriter> +TStructuredRowStreamDescription IVanillaJob<TWriter>::GetOutputRowStreamDescription() const +{ + return NYT::NDetail::GetStructuredRowStreamDescription<typename TWriter::TRowType>(); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/operation.cpp b/yt/cpp/mapreduce/interface/operation.cpp new file mode 100644 index 0000000000..706fc4caa4 --- /dev/null +++ b/yt/cpp/mapreduce/interface/operation.cpp @@ -0,0 +1,663 @@ +#include "operation.h" + +#include <util/generic/iterator_range.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +namespace NDetail { + i64 OutputTableCount = -1; +} // namespace NDetail + +//////////////////////////////////////////////////////////////////////////////// + +TTaskName::TTaskName(TString taskName) + : TaskName_(std::move(taskName)) +{ } + +TTaskName::TTaskName(const char* taskName) + : TaskName_(taskName) +{ } + +TTaskName::TTaskName(ETaskName taskName) + : TaskName_(ToString(taskName)) +{ } + +const TString& TTaskName::Get() const +{ + return TaskName_; +} + +//////////////////////////////////////////////////////////////////////////////// + +TCommandRawJob::TCommandRawJob(TStringBuf command) + : Command_(command) +{ } + 
+const TString& TCommandRawJob::GetCommand() const +{ + return Command_; +} + +void TCommandRawJob::Do(const TRawJobContext& /* jobContext */) +{ + Y_FAIL("TCommandRawJob::Do must not be called"); +} + +REGISTER_NAMED_RAW_JOB("NYT::TCommandRawJob", TCommandRawJob) + +//////////////////////////////////////////////////////////////////////////////// + +TCommandVanillaJob::TCommandVanillaJob(TStringBuf command) + : Command_(command) +{ } + +const TString& TCommandVanillaJob::GetCommand() const +{ + return Command_; +} + +void TCommandVanillaJob::Do() +{ + Y_FAIL("TCommandVanillaJob::Do must not be called"); +} + +REGISTER_NAMED_VANILLA_JOB("NYT::TCommandVanillaJob", TCommandVanillaJob); + +//////////////////////////////////////////////////////////////////////////////// + +bool operator==(const TUnspecifiedTableStructure&, const TUnspecifiedTableStructure&) +{ + return true; +} + +bool operator==(const TProtobufTableStructure& lhs, const TProtobufTableStructure& rhs) +{ + return lhs.Descriptor == rhs.Descriptor; +} + +//////////////////////////////////////////////////////////////////////////////// + +const TVector<TStructuredTablePath>& TOperationInputSpecBase::GetStructuredInputs() const +{ + return StructuredInputs_; +} + +const TVector<TStructuredTablePath>& TOperationOutputSpecBase::GetStructuredOutputs() const +{ + return StructuredOutputs_; +} + +void TOperationInputSpecBase::AddStructuredInput(TStructuredTablePath path) +{ + Inputs_.push_back(path.RichYPath); + StructuredInputs_.push_back(std::move(path)); +} + +void TOperationOutputSpecBase::AddStructuredOutput(TStructuredTablePath path) +{ + Outputs_.push_back(path.RichYPath); + StructuredOutputs_.push_back(std::move(path)); +} + +//////////////////////////////////////////////////////////////////////////////// + +TVanillaTask& TVanillaTask::AddStructuredOutput(TStructuredTablePath path) +{ + TOperationOutputSpecBase::AddStructuredOutput(std::move(path)); + return *this; +} + 
+//////////////////////////////////////////////////////////////////////////////// + +TStructuredRowStreamDescription IVanillaJob<void>::GetInputRowStreamDescription() const +{ + return TVoidStructuredRowStream(); +} + +TStructuredRowStreamDescription IVanillaJob<void>::GetOutputRowStreamDescription() const +{ + return TVoidStructuredRowStream(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TRawJobContext::TRawJobContext(size_t outputTableCount) + : InputFile_(Duplicate(0)) +{ + for (size_t i = 0; i != outputTableCount; ++i) { + OutputFileList_.emplace_back(Duplicate(3 * i + 1)); + } +} + +const TFile& TRawJobContext::GetInputFile() const +{ + return InputFile_; +} + +const TVector<TFile>& TRawJobContext::GetOutputFileList() const +{ + return OutputFileList_; +} + +//////////////////////////////////////////////////////////////////////////////// + +TUserJobSpec& TUserJobSpec::AddLocalFile( + const TLocalFilePath& path, + const TAddLocalFileOptions& options) +{ + LocalFiles_.emplace_back(path, options); + return *this; +} + +TUserJobSpec& TUserJobSpec::JobBinaryLocalPath(TString path, TMaybe<TString> md5) +{ + JobBinary_ = TJobBinaryLocalPath{path, md5}; + return *this; +} + +TUserJobSpec& TUserJobSpec::JobBinaryCypressPath(TString path, TMaybe<TTransactionId> transactionId) +{ + JobBinary_ = TJobBinaryCypressPath{path, transactionId}; + return *this; +} + +const TJobBinaryConfig& TUserJobSpec::GetJobBinary() const +{ + return JobBinary_; +} + +TVector<std::tuple<TLocalFilePath, TAddLocalFileOptions>> TUserJobSpec::GetLocalFiles() const +{ + return LocalFiles_; +} + +//////////////////////////////////////////////////////////////////////////////// + +TJobOperationPreparer::TInputGroup::TInputGroup(TJobOperationPreparer& preparer, TVector<int> indices) + : Preparer_(preparer) + , Indices_(std::move(indices)) +{ } + +TJobOperationPreparer::TInputGroup& TJobOperationPreparer::TInputGroup::ColumnRenaming(const THashMap<TString, 
TString>& renaming) +{ + for (auto i : Indices_) { + Preparer_.InputColumnRenaming(i, renaming); + } + return *this; +} + +TJobOperationPreparer::TInputGroup& TJobOperationPreparer::TInputGroup::ColumnFilter(const TVector<TString>& columns) +{ + for (auto i : Indices_) { + Preparer_.InputColumnFilter(i, columns); + } + return *this; +} + +TJobOperationPreparer& TJobOperationPreparer::TInputGroup::EndInputGroup() +{ + return Preparer_; +} + +TJobOperationPreparer::TOutputGroup::TOutputGroup(TJobOperationPreparer& preparer, TVector<int> indices) + : Preparer_(preparer) + , Indices_(std::move(indices)) +{ } + +TJobOperationPreparer::TOutputGroup& TJobOperationPreparer::TOutputGroup::Schema(const TTableSchema &schema) +{ + for (auto i : Indices_) { + Preparer_.OutputSchema(i, schema); + } + return *this; +} + +TJobOperationPreparer::TOutputGroup& TJobOperationPreparer::TOutputGroup::NoSchema() +{ + for (auto i : Indices_) { + Preparer_.NoOutputSchema(i); + } + return *this; +} + +TJobOperationPreparer& TJobOperationPreparer::TOutputGroup::EndOutputGroup() +{ + return Preparer_; +} + +//////////////////////////////////////////////////////////////////////////////// + +TJobOperationPreparer::TJobOperationPreparer(const IOperationPreparationContext& context) + : Context_(context) + , OutputSchemas_(context.GetOutputCount()) + , InputColumnRenamings_(context.GetInputCount()) + , InputColumnFilters_(context.GetInputCount()) + , InputTableDescriptions_(context.GetInputCount()) + , OutputTableDescriptions_(context.GetOutputCount()) +{ } + +TJobOperationPreparer::TInputGroup TJobOperationPreparer::BeginInputGroup(int begin, int end) +{ + Y_ENSURE_EX(begin <= end, TApiUsageError() + << "BeginInputGroup(): begin must not exceed end, got " << begin << ", " << end); + TVector<int> indices; + for (int i = begin; i < end; ++i) { + ValidateInputTableIndex(i, TStringBuf("BeginInputGroup()")); + indices.push_back(i); + } + return TInputGroup(*this, std::move(indices)); +} + + 
+TJobOperationPreparer::TOutputGroup TJobOperationPreparer::BeginOutputGroup(int begin, int end) +{ + Y_ENSURE_EX(begin <= end, TApiUsageError() + << "BeginOutputGroup(): begin must not exceed end, got " << begin << ", " << end); + TVector<int> indices; + for (int i = begin; i < end; ++i) { + ValidateOutputTableIndex(i, TStringBuf("BeginOutputGroup()")); + indices.push_back(i); + } + return TOutputGroup(*this, std::move(indices)); +} + +TJobOperationPreparer& TJobOperationPreparer::NodeOutput(int tableIndex) +{ + ValidateMissingOutputDescription(tableIndex); + OutputTableDescriptions_[tableIndex] = StructuredTableDescription<TNode>(); + return *this; +} + +TJobOperationPreparer& TJobOperationPreparer::OutputSchema(int tableIndex, TTableSchema schema) +{ + ValidateMissingOutputSchema(tableIndex); + OutputSchemas_[tableIndex] = std::move(schema); + return *this; +} + +TJobOperationPreparer& TJobOperationPreparer::NoOutputSchema(int tableIndex) +{ + ValidateMissingOutputSchema(tableIndex); + OutputSchemas_[tableIndex] = EmptyNonstrictSchema(); + return *this; +} + +TJobOperationPreparer& TJobOperationPreparer::InputColumnRenaming( + int tableIndex, + const THashMap<TString,TString>& renaming) +{ + ValidateInputTableIndex(tableIndex, TStringBuf("InputColumnRenaming()")); + InputColumnRenamings_[tableIndex] = renaming; + return *this; +} + +TJobOperationPreparer& TJobOperationPreparer::InputColumnFilter(int tableIndex, const TVector<TString>& columns) +{ + ValidateInputTableIndex(tableIndex, TStringBuf("InputColumnFilter()")); + InputColumnFilters_[tableIndex] = columns; + return *this; +} + +TJobOperationPreparer& TJobOperationPreparer::FormatHints(TUserJobFormatHints newFormatHints) +{ + FormatHints_ = newFormatHints; + return *this; +} + +void TJobOperationPreparer::Finish() +{ + FinallyValidate(); +} + +TVector<TTableSchema> TJobOperationPreparer::GetOutputSchemas() +{ + TVector<TTableSchema> result; + result.reserve(OutputSchemas_.size()); + for (auto& schema : 
OutputSchemas_) { + Y_VERIFY(schema.Defined()); + result.push_back(std::move(*schema)); + schema.Clear(); + } + return result; +} + +void TJobOperationPreparer::FinallyValidate() const +{ + TVector<int> illegallyMissingSchemaIndices; + for (int i = 0; i < static_cast<int>(OutputSchemas_.size()); ++i) { + if (!OutputSchemas_[i]) { + illegallyMissingSchemaIndices.push_back(i); + } + } + if (illegallyMissingSchemaIndices.empty()) { + return; + } + TApiUsageError error; + error << "Output table schemas are missing: "; + for (auto i : illegallyMissingSchemaIndices) { + error << "no. " << i; + if (auto path = Context_.GetInputPath(i)) { + error << "(" << *path << ")"; + } + error << "; "; + } + ythrow std::move(error); +} + +//////////////////////////////////////////////////////////////////////////////// + +void TJobOperationPreparer::ValidateInputTableIndex(int tableIndex, TStringBuf message) const +{ + Y_ENSURE_EX( + 0 <= tableIndex && tableIndex < static_cast<int>(Context_.GetInputCount()), + TApiUsageError() << + message << ": input table index " << tableIndex << " us out of range [0;" << + OutputSchemas_.size() << ")"); +} + +void TJobOperationPreparer::ValidateOutputTableIndex(int tableIndex, TStringBuf message) const +{ + Y_ENSURE_EX( + 0 <= tableIndex && tableIndex < static_cast<int>(Context_.GetOutputCount()), + TApiUsageError() << + message << ": output table index " << tableIndex << " us out of range [0;" << + OutputSchemas_.size() << ")"); +} + +void TJobOperationPreparer::ValidateMissingOutputSchema(int tableIndex) const +{ + ValidateOutputTableIndex(tableIndex, "ValidateMissingOutputSchema()"); + Y_ENSURE_EX(!OutputSchemas_[tableIndex], + TApiUsageError() << + "Output table schema no. 
" << tableIndex << " " << + "(" << Context_.GetOutputPath(tableIndex).GetOrElse("<unknown path>") << ") " << + "is already set"); +} + +void TJobOperationPreparer::ValidateMissingInputDescription(int tableIndex) const +{ + ValidateInputTableIndex(tableIndex, "ValidateMissingInputDescription()"); + Y_ENSURE_EX(!InputTableDescriptions_[tableIndex], + TApiUsageError() << + "Description for input no. " << tableIndex << " " << + "(" << Context_.GetOutputPath(tableIndex).GetOrElse("<unknown path>") << ") " << + "is already set"); +} + +void TJobOperationPreparer::ValidateMissingOutputDescription(int tableIndex) const +{ + ValidateOutputTableIndex(tableIndex, "ValidateMissingOutputDescription()"); + Y_ENSURE_EX(!OutputTableDescriptions_[tableIndex], + TApiUsageError() << + "Description for output no. " << tableIndex << " " << + "(" << Context_.GetOutputPath(tableIndex).GetOrElse("<unknown path>") << ") " << + "is already set"); +} + +TTableSchema TJobOperationPreparer::EmptyNonstrictSchema() { + return TTableSchema().Strict(false); +} + +//////////////////////////////////////////////////////////////////////////////// + +const TVector<THashMap<TString, TString>>& TJobOperationPreparer::GetInputColumnRenamings() const +{ + return InputColumnRenamings_; +} + +const TVector<TMaybe<TVector<TString>>>& TJobOperationPreparer::GetInputColumnFilters() const +{ + return InputColumnFilters_; +} + +const TVector<TMaybe<TTableStructure>>& TJobOperationPreparer::GetInputDescriptions() const +{ + return InputTableDescriptions_; +} + +const TVector<TMaybe<TTableStructure>>& TJobOperationPreparer::GetOutputDescriptions() const +{ + return OutputTableDescriptions_; +} + +const TUserJobFormatHints& TJobOperationPreparer::GetFormatHints() const +{ + return FormatHints_; +} + +TJobOperationPreparer& TJobOperationPreparer::InputFormatHints(TFormatHints hints) +{ + FormatHints_.InputFormatHints(hints); + return *this; +} + +TJobOperationPreparer& 
TJobOperationPreparer::OutputFormatHints(TFormatHints hints) +{ + FormatHints_.OutputFormatHints(hints); + return *this; +} + +//////////////////////////////////////////////////////////////////////////////// + +void IJob::PrepareOperation(const IOperationPreparationContext& context, TJobOperationPreparer& resultBuilder) const +{ + for (int i = 0; i < context.GetOutputCount(); ++i) { + resultBuilder.NoOutputSchema(i); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +IOperationPtr IOperationClient::Map( + const TMapOperationSpec& spec, + ::TIntrusivePtr<IMapperBase> mapper, + const TOperationOptions& options) +{ + Y_VERIFY(mapper.Get()); + + return DoMap( + spec, + std::move(mapper), + options); +} + +IOperationPtr IOperationClient::Map( + ::TIntrusivePtr<IMapperBase> mapper, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TMapOperationSpec& spec, + const TOperationOptions& options) +{ + Y_ENSURE_EX(spec.Inputs_.empty(), + TApiUsageError() << "TMapOperationSpec::Inputs MUST be empty"); + Y_ENSURE_EX(spec.Outputs_.empty(), + TApiUsageError() << "TMapOperationSpec::Outputs MUST be empty"); + + auto mapSpec = spec; + for (const auto& inputPath : input.Parts_) { + mapSpec.AddStructuredInput(inputPath); + } + for (const auto& outputPath : output.Parts_) { + mapSpec.AddStructuredOutput(outputPath); + } + return Map(mapSpec, std::move(mapper), options); +} + +IOperationPtr IOperationClient::Reduce( + const TReduceOperationSpec& spec, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options) +{ + Y_VERIFY(reducer.Get()); + + return DoReduce( + spec, + std::move(reducer), + options); +} + +IOperationPtr IOperationClient::Reduce( + ::TIntrusivePtr<IReducerBase> reducer, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TSortColumns& reduceBy, + const TReduceOperationSpec& spec, + const 
TOperationOptions& options) +{ + Y_ENSURE_EX(spec.Inputs_.empty(), + TApiUsageError() << "TReduceOperationSpec::Inputs MUST be empty"); + Y_ENSURE_EX(spec.Outputs_.empty(), + TApiUsageError() << "TReduceOperationSpec::Outputs MUST be empty"); + Y_ENSURE_EX(spec.ReduceBy_.Parts_.empty(), + TApiUsageError() << "TReduceOperationSpec::ReduceBy MUST be empty"); + + auto reduceSpec = spec; + for (const auto& inputPath : input.Parts_) { + reduceSpec.AddStructuredInput(inputPath); + } + for (const auto& outputPath : output.Parts_) { + reduceSpec.AddStructuredOutput(outputPath); + } + reduceSpec.ReduceBy(reduceBy); + return Reduce(reduceSpec, std::move(reducer), options); +} + +IOperationPtr IOperationClient::JoinReduce( + const TJoinReduceOperationSpec& spec, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options) +{ + Y_VERIFY(reducer.Get()); + + return DoJoinReduce( + spec, + std::move(reducer), + options); +} + +IOperationPtr IOperationClient::MapReduce( + const TMapReduceOperationSpec& spec, + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options) +{ + Y_VERIFY(reducer.Get()); + + return DoMapReduce( + spec, + std::move(mapper), + nullptr, + std::move(reducer), + options); +} + +IOperationPtr IOperationClient::MapReduce( + const TMapReduceOperationSpec& spec, + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reduceCombiner, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options) +{ + Y_VERIFY(reducer.Get()); + + return DoMapReduce( + spec, + std::move(mapper), + std::move(reduceCombiner), + std::move(reducer), + options); +} + +IOperationPtr IOperationClient::MapReduce( + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reducer, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TSortColumns& reduceBy, + TMapReduceOperationSpec spec, + const TOperationOptions& options) +{ + 
Y_ENSURE_EX(spec.Inputs_.empty(), + TApiUsageError() << "TMapReduceOperationSpec::Inputs MUST be empty"); + Y_ENSURE_EX(spec.Outputs_.empty(), + TApiUsageError() << "TMapReduceOperationSpec::Outputs MUST be empty"); + Y_ENSURE_EX(spec.ReduceBy_.Parts_.empty(), + TApiUsageError() << "TMapReduceOperationSpec::ReduceBy MUST be empty"); + + for (const auto& inputPath : input.Parts_) { + spec.AddStructuredInput(inputPath); + } + for (const auto& outputPath : output.Parts_) { + spec.AddStructuredOutput(outputPath); + } + spec.ReduceBy(reduceBy); + return MapReduce(spec, std::move(mapper), std::move(reducer), options); +} + +IOperationPtr IOperationClient::MapReduce( + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reduceCombiner, + ::TIntrusivePtr<IReducerBase> reducer, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TSortColumns& reduceBy, + TMapReduceOperationSpec spec, + const TOperationOptions& options) +{ + Y_ENSURE_EX(spec.Inputs_.empty(), + TApiUsageError() << "TMapReduceOperationSpec::Inputs MUST be empty"); + Y_ENSURE_EX(spec.Outputs_.empty(), + TApiUsageError() << "TMapReduceOperationSpec::Outputs MUST be empty"); + Y_ENSURE_EX(spec.ReduceBy_.Parts_.empty(), + TApiUsageError() << "TMapReduceOperationSpec::ReduceBy MUST be empty"); + + for (const auto& inputPath : input.Parts_) { + spec.AddStructuredInput(inputPath); + } + for (const auto& outputPath : output.Parts_) { + spec.AddStructuredOutput(outputPath); + } + spec.ReduceBy(reduceBy); + return MapReduce(spec, std::move(mapper), std::move(reduceCombiner), std::move(reducer), options); +} + +IOperationPtr IOperationClient::Sort( + const TOneOrMany<TRichYPath>& input, + const TRichYPath& output, + const TSortColumns& sortBy, + const TSortOperationSpec& spec, + const TOperationOptions& options) +{ + Y_ENSURE_EX(spec.Inputs_.empty(), + TApiUsageError() << "TSortOperationSpec::Inputs MUST be empty"); + 
Y_ENSURE_EX(spec.Output_.Path_.empty(), + TApiUsageError() << "TSortOperationSpec::Output MUST be empty"); + Y_ENSURE_EX(spec.SortBy_.Parts_.empty(), + TApiUsageError() << "TSortOperationSpec::SortBy MUST be empty"); + + auto sortSpec = spec; + for (const auto& inputPath : input.Parts_) { + sortSpec.AddInput(inputPath); + } + sortSpec.Output(output); + sortSpec.SortBy(sortBy); + return Sort(sortSpec, options); +} + +//////////////////////////////////////////////////////////////////////////////// + +TRawTableReaderPtr IStructuredJob::CreateCustomRawJobReader(int) const +{ + return nullptr; +} + +THolder<IProxyOutput> IStructuredJob::CreateCustomRawJobWriter(size_t) const +{ + return nullptr; +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/operation.h b/yt/cpp/mapreduce/interface/operation.h new file mode 100644 index 0000000000..171a7e4af7 --- /dev/null +++ b/yt/cpp/mapreduce/interface/operation.h @@ -0,0 +1,3494 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/operation.h +/// +/// Header containing interface to run operations in YT +/// and retrieve information about them. +/// @see [the doc](https://yt.yandex-team.ru/docs/description/mr/map_reduce_overview.html). + +#include "client_method_options.h" +#include "errors.h" +#include "io.h" +#include "job_statistics.h" +#include "job_counters.h" + +#include <library/cpp/threading/future/future.h> +#include <library/cpp/type_info/type_info.h> + +#include <util/datetime/base.h> +#include <util/generic/variant.h> +#include <util/generic/vector.h> +#include <util/generic/maybe.h> +#include <util/system/file.h> +#include <util/system/types.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// Tag class marking that the row type for table is not specified. 
+struct TUnspecifiedTableStructure +{ }; + +/// Tag class marking that table rows have protobuf type. +struct TProtobufTableStructure +{ + /// @brief Descriptor of the protobuf type of table rows. + /// + /// @note If table is tagged with @ref ::google::protobuf::Message instead of real proto class + /// this descriptor might be null. + const ::google::protobuf::Descriptor* Descriptor = nullptr; +}; + + +/// Tag class to specify table row type. +using TTableStructure = std::variant< + TUnspecifiedTableStructure, + TProtobufTableStructure +>; + +bool operator==(const TUnspecifiedTableStructure&, const TUnspecifiedTableStructure&); +bool operator==(const TProtobufTableStructure& lhs, const TProtobufTableStructure& rhs); + +/// Table path marked with @ref NYT::TTableStructure tag. +struct TStructuredTablePath +{ + TStructuredTablePath(TRichYPath richYPath = TRichYPath(), TTableStructure description = TUnspecifiedTableStructure()) + : RichYPath(std::move(richYPath)) + , Description(std::move(description)) + { } + + TStructuredTablePath(TRichYPath richYPath, const ::google::protobuf::Descriptor* descriptor) + : RichYPath(std::move(richYPath)) + , Description(TProtobufTableStructure({descriptor})) + { } + + TStructuredTablePath(TYPath path) + : RichYPath(std::move(path)) + , Description(TUnspecifiedTableStructure()) + { } + + TStructuredTablePath(const char* path) + : RichYPath(path) + , Description(TUnspecifiedTableStructure()) + { } + + TRichYPath RichYPath; + TTableStructure Description; +}; + +/// Create marked table path from row type. +template <typename TRow> +TStructuredTablePath Structured(TRichYPath richYPath); + +/// Create tag class from row type. +template <typename TRow> +TTableStructure StructuredTableDescription(); + +/////////////////////////////////////////////////////////////////////////////// + +/// Tag class marking that row stream is empty. +struct TVoidStructuredRowStream +{ }; + +/// Tag class marking that row stream consists of `NYT::TNode`. 
+struct TTNodeStructuredRowStream +{ }; + +/// Tag class marking that row stream consists of @ref NYT::TYaMRRow. +struct TTYaMRRowStructuredRowStream +{ }; + +/// Tag class marking that row stream consists of protobuf rows of given type. +struct TProtobufStructuredRowStream +{ + /// @brief Descriptor of the protobuf type of table rows. + /// + /// @note If `Descriptor` is nullptr, then row stream consists of multiple message types. + const ::google::protobuf::Descriptor* Descriptor = nullptr; +}; + +/// Tag class to specify type of rows in an operation row stream +using TStructuredRowStreamDescription = std::variant< + TVoidStructuredRowStream, + TTNodeStructuredRowStream, + TTYaMRRowStructuredRowStream, + TProtobufStructuredRowStream +>; + +/////////////////////////////////////////////////////////////////////////////// + +/// Tag class marking that current binary should be used in operation. +struct TJobBinaryDefault +{ }; + +/// Tag class marking that binary from specified local path should be used in operation. +struct TJobBinaryLocalPath +{ + TString Path; + TMaybe<TString> MD5CheckSum; +}; + +/// Tag class marking that binary from specified Cypress path should be used in operation. +struct TJobBinaryCypressPath +{ + TYPath Path; + TMaybe<TTransactionId> TransactionId; +}; + +//////////////////////////////////////////////////////////////////////////////// + + +/// @cond Doxygen_Suppress +namespace NDetail { + extern i64 OutputTableCount; +} // namespace NDetail +/// @endcond + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Auto merge mode. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/automerge +enum class EAutoMergeMode +{ + /// Auto merge is disabled. + Disabled /* "disabled" */, + + /// Mode that tries to achieve good chunk sizes and doesn't limit usage of chunk quota for intermediate chunks. 
+ Relaxed /* "relaxed" */, + + /// Mode that tries to optimize usage of chunk quota for intermediate chunks, operation might run slower. + Economy /* "economy" */, + + /// + /// @brief Manual configuration of automerge parameters. + /// + /// @ref TAutoMergeSpec + Manual /* "manual" */, +}; + +/// +/// @brief Options for auto merge operation stage. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/automerge +class TAutoMergeSpec +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TAutoMergeSpec; + /// @endcond + + /// Mode of the auto merge. + FLUENT_FIELD_OPTION(EAutoMergeMode, Mode); + + /// @brief Upper limit for number of intermediate chunks. + /// + /// Works only for Manual mode. + FLUENT_FIELD_OPTION(i64, MaxIntermediateChunkCount); + + /// @brief Number of chunks limit to merge in one job. + /// + /// Works only for Manual mode. + FLUENT_FIELD_OPTION(i64, ChunkCountPerMergeJob); + + /// @brief Automerge will not merge chunks that are larger than `DesiredChunkSize * (ChunkSizeThreshold / 100.)` + /// + /// Works only for Manual mode. + FLUENT_FIELD_OPTION(i64, ChunkSizeThreshold); +}; + +/// Base for operations with auto merge options. +template <class TDerived> +class TWithAutoMergeSpec +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// @brief Options for auto merge operation stage. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/automerge + FLUENT_FIELD_OPTION(TAutoMergeSpec, AutoMerge); +}; + +/// +/// @brief Resources controlled by scheduler and used by running operations. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/scheduler/scheduler_and_pools#resursy +class TSchedulerResources +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TSchedulerResources; + /// @endcond + + /// Each job consumes exactly one user slot. + FLUENT_FIELD_OPTION_ENCAPSULATED(i64, UserSlots); + + /// Number of (virtual) cpu cores consumed by all jobs. 
+ FLUENT_FIELD_OPTION_ENCAPSULATED(i64, Cpu); + + /// Amount of memory in bytes. + FLUENT_FIELD_OPTION_ENCAPSULATED(i64, Memory); +}; + +/// Base for input format hints of a user job. +template <class TDerived> +class TUserJobInputFormatHintsBase +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// @brief Fine tune input format of the job. + FLUENT_FIELD_OPTION(TFormatHints, InputFormatHints); +}; + +/// Base for output format hints of a user job. +template <class TDerived> +class TUserJobOutputFormatHintsBase +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// @brief Fine tune output format of the job. + FLUENT_FIELD_OPTION(TFormatHints, OutputFormatHints); +}; + +/// Base for format hints of a user job. +template <class TDerived> +class TUserJobFormatHintsBase + : public TUserJobInputFormatHintsBase<TDerived> + , public TUserJobOutputFormatHintsBase<TDerived> +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond +}; + +/// User job format hints. +class TUserJobFormatHints + : public TUserJobFormatHintsBase<TUserJobFormatHints> +{ }; + +/// Spec of input and output tables of a raw operation. +template <class TDerived> +class TRawOperationIoTableSpec +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// Add input table path to input path list. + TDerived& AddInput(const TRichYPath& path); + + /// Set input table path no. `tableIndex`. + TDerived& SetInput(size_t tableIndex, const TRichYPath& path); + + /// Add output table path to output path list. + TDerived& AddOutput(const TRichYPath& path); + + /// Set output table path no. `tableIndex`. + TDerived& SetOutput(size_t tableIndex, const TRichYPath& path); + + /// Get all input table paths. + const TVector<TRichYPath>& GetInputs() const; + + /// Get all output table paths. 
+ const TVector<TRichYPath>& GetOutputs() const; + +private: + TVector<TRichYPath> Inputs_; + TVector<TRichYPath> Outputs_; +}; + +/// Base spec for IO in "simple" raw operations (Map, Reduce etc.). +template <class TDerived> +struct TSimpleRawOperationIoSpec + : public TRawOperationIoTableSpec<TDerived> +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// @brief Describes format for both input and output. + /// + /// @note `Format' is overridden by `InputFormat' and `OutputFormat'. + FLUENT_FIELD_OPTION(TFormat, Format); + + /// Describes input format. + FLUENT_FIELD_OPTION(TFormat, InputFormat); + + /// Describes output format. + FLUENT_FIELD_OPTION(TFormat, OutputFormat); +}; + +/// Spec for IO in MapReduce operation. +template <class TDerived> +class TRawMapReduceOperationIoSpec + : public TRawOperationIoTableSpec<TDerived> +{ +public: + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// @brief Describes format for both input and output of mapper. + /// + /// @note `MapperFormat' is overridden by `MapperInputFormat' and `MapperOutputFormat'. + FLUENT_FIELD_OPTION(TFormat, MapperFormat); + + /// Describes mapper input format. + FLUENT_FIELD_OPTION(TFormat, MapperInputFormat); + + /// Describes mapper output format. + FLUENT_FIELD_OPTION(TFormat, MapperOutputFormat); + + /// @brief Describes format for both input and output of reduce combiner. + /// + /// @note `ReduceCombinerFormat' is overridden by `ReduceCombinerInputFormat' and `ReduceCombinerOutputFormat'. + FLUENT_FIELD_OPTION(TFormat, ReduceCombinerFormat); + + /// Describes reduce combiner input format. + FLUENT_FIELD_OPTION(TFormat, ReduceCombinerInputFormat); + + /// Describes reduce combiner output format. + FLUENT_FIELD_OPTION(TFormat, ReduceCombinerOutputFormat); + + /// @brief Describes format for both input and output of reducer. + /// + /// @note `ReducerFormat' is overridden by `ReducerInputFormat' and `ReducerOutputFormat'.
+ FLUENT_FIELD_OPTION(TFormat, ReducerFormat); + + /// Describes reducer input format. + FLUENT_FIELD_OPTION(TFormat, ReducerInputFormat); + + /// Describes reducer output format. + FLUENT_FIELD_OPTION(TFormat, ReducerOutputFormat); + + /// Add direct map output table path. + TDerived& AddMapOutput(const TRichYPath& path); + + /// Set direct map output table path no. `tableIndex`. + TDerived& SetMapOutput(size_t tableIndex, const TRichYPath& path); + + /// Get all direct map output table paths + const TVector<TRichYPath>& GetMapOutputs() const; + +private: + TVector<TRichYPath> MapOutputs_; +}; + +/// +/// @brief Base spec of operations with input tables. +class TOperationInputSpecBase +{ +public: + template <class T, class = void> + struct TFormatAdder; + + /// + /// @brief Add input table path to input path list and specify type of rows. + template <class T> + void AddInput(const TRichYPath& path); + + /// + /// @brief Add input table path as structured paths. + void AddStructuredInput(TStructuredTablePath path); + + /// + /// @brief Set input table path and type. + template <class T> + void SetInput(size_t tableIndex, const TRichYPath& path); + + /// + /// @brief All input paths. + TVector<TRichYPath> Inputs_; + + /// + /// @brief Get all input structured paths. + const TVector<TStructuredTablePath>& GetStructuredInputs() const; + +private: + TVector<TStructuredTablePath> StructuredInputs_; + friend struct TOperationIOSpecBase; + template <class T> + friend struct TOperationIOSpec; +}; + +/// +/// @brief Base spec of operations with output tables. +class TOperationOutputSpecBase +{ +public: + template <class T, class = void> + struct TFormatAdder; + + /// + /// @brief Add output table path to output path list and specify type of rows. + template <class T> + void AddOutput(const TRichYPath& path); + + /// + /// @brief Add output table path as structured paths. 
+ void AddStructuredOutput(TStructuredTablePath path); + + /// + /// @brief Set output table path and type. + template <class T> + void SetOutput(size_t tableIndex, const TRichYPath& path); + + /// + /// @brief All output paths. + TVector<TRichYPath> Outputs_; + + /// + /// @brief Get all output structured paths. + const TVector<TStructuredTablePath>& GetStructuredOutputs() const; + +private: + TVector<TStructuredTablePath> StructuredOutputs_; + friend struct TOperationIOSpecBase; + template <class T> + friend struct TOperationIOSpec; +}; + +/// +/// @brief Base spec for operations with inputs and outputs. +struct TOperationIOSpecBase + : public TOperationInputSpecBase + , public TOperationOutputSpecBase +{ }; + +/// +/// @brief Base spec for operations with inputs and outputs. +template <class TDerived> +struct TOperationIOSpec + : public TOperationIOSpecBase +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + template <class T> + TDerived& AddInput(const TRichYPath& path); + + TDerived& AddStructuredInput(TStructuredTablePath path); + + template <class T> + TDerived& SetInput(size_t tableIndex, const TRichYPath& path); + + template <class T> + TDerived& AddOutput(const TRichYPath& path); + + TDerived& AddStructuredOutput(TStructuredTablePath path); + + template <class T> + TDerived& SetOutput(size_t tableIndex, const TRichYPath& path); + + + // DON'T USE THESE METHODS! They are left solely for backward compatibility. + // These methods are the only way to do equivalent of (Add/Set)(Input/Output)<Message> + // but please consider using (Add/Set)(Input/Output)<TConcreteMessage> + // (where TConcreteMessage is some descendant of Message) + // because they are faster and better (see https://st.yandex-team.ru/YT-6967) + TDerived& AddProtobufInput_VerySlow_Deprecated(const TRichYPath& path); + TDerived& AddProtobufOutput_VerySlow_Deprecated(const TRichYPath& path); +}; + +/// +/// @brief Base spec for all operations. 
+/// +/// @see https://yt.yandex-team.ru/docs/description/mr/operations_options +template <class TDerived> +struct TOperationSpecBase +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// + /// @brief Limit on operation execution time. + /// + /// If operation doesn't finish in time it will be aborted. + FLUENT_FIELD_OPTION(TDuration, TimeLimit); + + /// @brief Title to be shown in web interface. + FLUENT_FIELD_OPTION(TString, Title); + + /// @brief Pool to be used for this operation. + FLUENT_FIELD_OPTION(TString, Pool); + + /// @brief Weight of operation. + /// + /// Coefficient defining how much resources operation gets relative to its siblings in the same pool. + FLUENT_FIELD_OPTION(double, Weight); + + /// @brief Pool tree list that operation will use. + FLUENT_OPTIONAL_VECTOR_FIELD_ENCAPSULATED(TString, PoolTree); + + /// How much resources can be consumed by operation. + FLUENT_FIELD_OPTION_ENCAPSULATED(TSchedulerResources, ResourceLimits); +}; + +/// +/// @brief Base spec for all operations with user jobs. +template <class TDerived> +struct TUserOperationSpecBase + : TOperationSpecBase<TDerived> +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// How many jobs can fail before operation is failed. + FLUENT_FIELD_OPTION(ui64, MaxFailedJobCount); + + /// On any unsuccessful job completion (i.e. abortion or failure) force the whole operation to fail. + FLUENT_FIELD_OPTION(bool, FailOnJobRestart); + + /// + /// @brief Table to save whole stderr of operation. + /// + /// @see https://clubs.at.yandex-team.ru/yt/1045 + FLUENT_FIELD_OPTION(TYPath, StderrTablePath); + + /// + /// @brief Table to save coredumps of operation. + /// + /// @see https://clubs.at.yandex-team.ru/yt/1045 + FLUENT_FIELD_OPTION(TYPath, CoreTablePath); + + /// + /// @brief How long should the scheduler wait for the job to be started on a node.
+ /// + /// When you run huge jobs that require preemption of all the other jobs on + /// a node, the default timeout might be insufficient and your job may be + /// aborted with 'waiting_timeout' reason. This is especially problematic + /// when you are setting 'FailOnJobRestart' option. + /// + /// @note The value must be between 10 seconds and 10 minutes. + FLUENT_FIELD_OPTION(TDuration, WaitingJobTimeout); +}; + +/// +/// @brief Class to provide information on intermediate mapreduce stream protobuf types. +/// +/// When using protobuf format it is important to know exact types of proto messages +/// that are used in input/output. +/// +/// Sometimes such messages cannot be derived from job class +/// i.e. when job class uses `NYT::TTableReader<::google::protobuf::Message>` +/// or `NYT::TTableWriter<::google::protobuf::Message>`. +/// +/// When using such jobs user can provide exact message type using this class. +/// +/// @note Only input/output that relate to intermediate tables can be hinted. +/// Input to map and output of reduce is derived from `AddInput`/`AddOutput`. +template <class TDerived> +struct TIntermediateTablesHintSpec +{ + /// Specify intermediate map output type. + template <class T> + TDerived& HintMapOutput(); + + /// Specify reduce combiner input. + template <class T> + TDerived& HintReduceCombinerInput(); + + /// Specify reduce combiner output. + template <class T> + TDerived& HintReduceCombinerOutput(); + + /// Specify reducer input. + template <class T> + TDerived& HintReduceInput(); + + /// + /// @brief Add output of map stage. + /// + /// Mapper output table #0 is always intermediate table that is going to be reduced later. + /// Rows that mapper write to tables #1, #2, ... are saved in MapOutput tables. 
+ template <class T> + TDerived& AddMapOutput(const TRichYPath& path); + + TVector<TRichYPath> MapOutputs_; + + const TVector<TStructuredTablePath>& GetStructuredMapOutputs() const; + const TMaybe<TTableStructure>& GetIntermediateMapOutputDescription() const; + const TMaybe<TTableStructure>& GetIntermediateReduceCombinerInputDescription() const; + const TMaybe<TTableStructure>& GetIntermediateReduceCombinerOutputDescription() const; + const TMaybe<TTableStructure>& GetIntermediateReducerInputDescription() const; + +private: + TVector<TStructuredTablePath> StructuredMapOutputs_; + TMaybe<TTableStructure> IntermediateMapOutputDescription_; + TMaybe<TTableStructure> IntermediateReduceCombinerInputDescription_; + TMaybe<TTableStructure> IntermediateReduceCombinerOutputDescription_; + TMaybe<TTableStructure> IntermediateReducerInputDescription_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +struct TAddLocalFileOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TAddLocalFileOptions; + /// @endcond + + /// + /// @brief Path by which job will see the uploaded file. + /// + /// Defaults to basename of the local path. + FLUENT_FIELD_OPTION(TString, PathInJob); + + /// + /// @brief MD5 checksum of uploaded file. + /// + /// If not specified it is computed by this library. + /// If this argument is provided, the user can save some cpu and disk IO. + FLUENT_FIELD_OPTION(TString, MD5CheckSum); + + /// + /// @brief Do not put file into node cache + /// + /// @see NYT::TRichYPath::BypassArtifactCache + FLUENT_FIELD_OPTION(bool, BypassArtifactCache); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Binary to run job profiler on. +enum class EProfilingBinary +{ + /// Profile job proxy. + JobProxy /* "job_proxy" */, + + /// Profile user job. + UserJob /* "user_job" */, +}; + +/// @brief Type of job profiler. +enum class EProfilerType +{ + /// Profile CPU usage.
+ Cpu /* "cpu" */, + + /// Profile memory usage. + Memory /* "memory" */, + + /// Profile peak memory usage. + PeakMemory /* "peak_memory" */, +}; + +/// @brief Specifies a job profiler. +struct TJobProfilerSpec +{ + /// @cond Doxygen_Suppress + using TSelf = TJobProfilerSpec; + /// @endcond + + /// @brief Binary to profile. + FLUENT_FIELD_OPTION(EProfilingBinary, ProfilingBinary); + + /// @brief Type of the profiler. + FLUENT_FIELD_OPTION(EProfilerType, ProfilerType); + + /// @brief Probability of the job being selected for profiling. + FLUENT_FIELD_OPTION(double, ProfilingProbability); + + /// @brief For sampling profilers, sets the number of samples per second. + FLUENT_FIELD_OPTION(int, SamplingFrequency); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Spec of user job. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/operations_options#user_script_options +struct TUserJobSpec +{ + /// @cond Doxygen_Suppress + using TSelf = TUserJobSpec; + /// @endcond + + /// + /// @brief Specify a local file to upload to Cypress and prepare for use in job. + TSelf& AddLocalFile(const TLocalFilePath& path, const TAddLocalFileOptions& options = TAddLocalFileOptions()); + + /// + /// @brief Get the list of all added local files. + TVector<std::tuple<TLocalFilePath, TAddLocalFileOptions>> GetLocalFiles() const; + + /// @brief Paths to files in Cypress to use in job. + FLUENT_VECTOR_FIELD(TRichYPath, File); + + /// + /// @brief MemoryLimit specifies how much memory job process can use. + /// + /// @note + /// If job uses tmpfs (check @ref NYT::TOperationOptions::MountSandboxInTmpfs) + /// YT computes its memory usage as total of: + /// - memory usage of job process itself (including mapped files); + /// - total size of tmpfs used by this job.
+ /// + /// @note + /// When @ref NYT::TOperationOptions::MountSandboxInTmpfs is enabled library will compute + /// total size of all files used by this job and add this total size to MemoryLimit. + /// Thus you shouldn't include size of your files (e.g. binary file) into MemoryLimit. + /// + /// @note + /// Final memory memory_limit passed to YT is calculated as follows: + /// + /// @note + /// ``` + /// memory_limit = MemoryLimit + <total-size-of-used-files> + ExtraTmpfsSize + /// ``` + /// + /// @see NYT::TUserJobSpec::ExtraTmpfsSize + FLUENT_FIELD_OPTION(i64, MemoryLimit); + + /// + /// @brief Size of data that is going to be written to tmpfs. + /// + /// This option should be used if job writes data to tmpfs. + /// + /// ExtraTmpfsSize should not include size of files specified with + /// @ref NYT::TUserJobSpec::AddLocalFile or @ref NYT::TUserJobSpec::AddFile + /// These files are copied to tmpfs automatically and their total size + /// is computed automatically. + /// + /// @see NYT::TOperationOptions::MountSandboxInTmpfs + /// @see NYT::TUserJobSpec::MemoryLimit + FLUENT_FIELD_OPTION(i64, ExtraTmpfsSize); + + /// + /// @brief Maximum number of CPU cores for a single job to use. + FLUENT_FIELD_OPTION(double, CpuLimit); + + /// + /// @brief Fraction of @ref NYT::TUserJobSpec::MemoryLimit that job gets at start. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/operations_options#memory_reserve_factor + FLUENT_FIELD_OPTION(double, MemoryReserveFactor); + + /// + /// @brief Local path to executable to be used inside jobs. + //// + /// Provided executable must use C++ YT API library (this library) + /// and implement job class that is going to be used. + /// + /// This option might be useful if we want to start operation from nonlinux machines + /// (in that case we use `JobBinary` to provide path to the same program compiled for linux). 
+ /// Other example of using this option is uploading executable to cypress in advance + /// and save the time required to upload current executable to cache. + /// `md5` argument can be used to save cpu time and disk IO when binary MD5 checksum is known. + /// When argument is not provided library will compute it itself. + TUserJobSpec& JobBinaryLocalPath(TString path, TMaybe<TString> md5 = Nothing()); + + /// + /// @brief Cypress path to executable to be used inside jobs. + TUserJobSpec& JobBinaryCypressPath(TString path, TMaybe<TTransactionId> transactionId = Nothing()); + + /// + /// @brief String that will be prepended to the command. + /// + /// This option overrides @ref NYT::TOperationOptions::JobCommandPrefix. + FLUENT_FIELD(TString, JobCommandPrefix); + + /// + /// @brief String that will be appended to the command. + /// + /// This option overrides @ref NYT::TOperationOptions::JobCommandSuffix. + FLUENT_FIELD(TString, JobCommandSuffix); + + /// + /// @brief Map of environment variables that will be set for jobs. + FLUENT_MAP_FIELD(TString, TString, Environment); + + /// + /// @brief Limit for all files inside job sandbox (in bytes). + FLUENT_FIELD_OPTION(ui64, DiskSpaceLimit); + + /// + /// @brief Number of ports reserved for the job (passed through environment in YT_PORT_0, YT_PORT_1, ...). + FLUENT_FIELD_OPTION(ui16, PortCount); + + /// + /// @brief Network project used to isolate job network. + FLUENT_FIELD_OPTION(TString, NetworkProject); + + /// + /// @brief Limit on job execution time. + /// + /// Jobs that exceed this limit will be considered failed. + FLUENT_FIELD_OPTION(TDuration, JobTimeLimit); + + /// + /// @brief Get job binary config. + const TJobBinaryConfig& GetJobBinary() const; + + /// + /// @brief List of profilers to run. 
+ FLUENT_VECTOR_FIELD(TJobProfilerSpec, JobProfiler); + +private: + TVector<std::tuple<TLocalFilePath, TAddLocalFileOptions>> LocalFiles_; + TJobBinaryConfig JobBinary_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Spec of Map operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/map +template <typename TDerived> +struct TMapOperationSpecBase + : public TUserOperationSpecBase<TDerived> + , public TWithAutoMergeSpec<TDerived> +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// + /// @brief Spec of mapper job. + FLUENT_FIELD(TUserJobSpec, MapperSpec); + + /// + /// @brief Whether to guarantee the order of rows passed to mapper matches the order in the table. + /// + /// When `Ordered' is false (by default), there are no guarantees about order of reading rows. + /// In this case mapper might work slightly faster because row delivered from fast node can be processed while YT waits + /// for response from slow nodes. + /// When `Ordered' is true, rows will come in order in which they are stored in input tables. + FLUENT_FIELD_OPTION(bool, Ordered); + + /// + /// @brief Recommended number of jobs to run. + /// + /// `JobCount' has higher priority than @ref NYT::TMapOperationSpecBase::DataSizePerJob. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui32, JobCount); + + /// + /// @brief Recommended data size for each job. + /// + /// `DataSizePerJob` has lower priority than @ref NYT::TMapOperationSpecBase::JobCount. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui64, DataSizePerJob); +}; + +/// +/// @brief Spec of Map operation.
+/// +/// @see https://yt.yandex-team.ru/docs/description/mr/map +struct TMapOperationSpec + : public TMapOperationSpecBase<TMapOperationSpec> + , public TOperationIOSpec<TMapOperationSpec> + , public TUserJobFormatHintsBase<TMapOperationSpec> +{ }; + +/// +/// @brief Spec of raw Map operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/map +struct TRawMapOperationSpec + : public TMapOperationSpecBase<TRawMapOperationSpec> + , public TSimpleRawOperationIoSpec<TRawMapOperationSpec> +{ }; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Spec of Reduce operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/reduce +template <typename TDerived> +struct TReduceOperationSpecBase + : public TUserOperationSpecBase<TDerived> + , public TWithAutoMergeSpec<TDerived> +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// + /// @brief Spec of reduce job. + FLUENT_FIELD(TUserJobSpec, ReducerSpec); + + /// + /// @brief Columns to sort rows by (must include `ReduceBy` as prefix). + FLUENT_FIELD(TSortColumns, SortBy); + + /// + /// @brief Columns to group rows by. + FLUENT_FIELD(TSortColumns, ReduceBy); + + /// + /// @brief Columns to join foreign tables by (must be prefix of `ReduceBy`). + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/reduce#foreign_tables + FLUENT_FIELD_OPTION(TSortColumns, JoinBy); + + /// + /// @brief Guarantee to feed all rows with same `ReduceBy` columns to a single job (`true` by default). + FLUENT_FIELD_OPTION(bool, EnableKeyGuarantee); + + /// + /// @brief Recommended number of jobs to run. + /// + /// `JobCount' has higher priority than @ref NYT::TReduceOperationSpecBase::DataSizePerJob. + /// This option only provide a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui32, JobCount); + + /// + /// @brief Recommended of data size for each job. 
+ /// + /// `DataSizePerJob` has lower priority that @ref NYT::TReduceOperationSpecBase::JobCount. + /// This option only provide a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui64, DataSizePerJob); +}; + +/// +/// @brief Spec of Reduce operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/reduce +struct TReduceOperationSpec + : public TReduceOperationSpecBase<TReduceOperationSpec> + , public TOperationIOSpec<TReduceOperationSpec> + , public TUserJobFormatHintsBase<TReduceOperationSpec> +{ }; + +/// +/// @brief Spec of raw Reduce operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/reduce +struct TRawReduceOperationSpec + : public TReduceOperationSpecBase<TRawReduceOperationSpec> + , public TSimpleRawOperationIoSpec<TRawReduceOperationSpec> +{ }; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Spec of JoinReduce operation. +/// +/// @deprecated Instead the user should run a reduce operation +/// with @ref NYT::TReduceOperationSpec::EnableKeyGuarantee set to `false`. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/reduce#foreign_tables +template <typename TDerived> +struct TJoinReduceOperationSpecBase + : public TUserOperationSpecBase<TDerived> +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// + /// @brief Spec of reduce job. + FLUENT_FIELD(TUserJobSpec, ReducerSpec); + + /// + /// @brief Columns to join foreign tables by (must be prefix of `ReduceBy`). + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/reduce#foreign_tables + FLUENT_FIELD(TSortColumns, JoinBy); + + /// + /// @brief Recommended number of jobs to run. + /// + /// `JobCount' has higher priority than @ref NYT::TJoinReduceOperationSpecBase::DataSizePerJob. + /// This option only provide a recommendation and may be ignored if conflicting with YT internal limits. 
+ FLUENT_FIELD_OPTION(ui32, JobCount); + + /// + /// @brief Recommended of data size for each job. + /// + /// `DataSizePerJob` has lower priority that @ref NYT::TJoinReduceOperationSpecBase::JobCount. + /// This option only provide a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui64, DataSizePerJob); +}; + +/// +/// @brief Spec of JoinReduce operation. +/// +/// @deprecated Instead the user should run a reduce operation +/// with @ref NYT::TReduceOperationSpec::EnableKeyGuarantee set to `false`. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/reduce#foreign_tables +struct TJoinReduceOperationSpec + : public TJoinReduceOperationSpecBase<TJoinReduceOperationSpec> + , public TOperationIOSpec<TJoinReduceOperationSpec> + , public TUserJobFormatHintsBase<TJoinReduceOperationSpec> +{ }; + +/// +/// @brief Spec of raw JoinReduce operation. +/// +/// @deprecated Instead the user should run a reduce operation +/// with @ref NYT::TReduceOperationSpec::EnableKeyGuarantee set to `false`. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/reduce#foreign_tables +struct TRawJoinReduceOperationSpec + : public TJoinReduceOperationSpecBase<TRawJoinReduceOperationSpec> + , public TSimpleRawOperationIoSpec<TRawJoinReduceOperationSpec> +{ }; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Spec of MapReduce operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce +template <typename TDerived> +struct TMapReduceOperationSpecBase + : public TUserOperationSpecBase<TDerived> +{ + /// @cond Doxygen_Suppress + using TSelf = TDerived; + /// @endcond + + /// + /// @brief Spec of map job. + FLUENT_FIELD(TUserJobSpec, MapperSpec); + + /// + /// @brief Spec of reduce job. + FLUENT_FIELD(TUserJobSpec, ReducerSpec); + + /// + /// @brief Spec of reduce combiner. 
+ FLUENT_FIELD(TUserJobSpec, ReduceCombinerSpec); + + /// + /// @brief Columns to sort rows by (must include `ReduceBy` as prefix). + FLUENT_FIELD(TSortColumns, SortBy); + + /// + /// @brief Columns to group rows by. + FLUENT_FIELD(TSortColumns, ReduceBy); + + /// + /// @brief Recommended number of map jobs to run. + /// + /// `MapJobCount' has higher priority than @ref NYT::TMapReduceOperationSpecBase::DataSizePerMapJob. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui32, MapJobCount); + + /// + /// @brief Recommended data size for each map job. + /// + /// `DataSizePerMapJob` has lower priority than @ref NYT::TMapReduceOperationSpecBase::MapJobCount. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui64, DataSizePerMapJob); + + /// + /// @brief Recommended number of intermediate data partitions. + FLUENT_FIELD_OPTION(ui64, PartitionCount); + + /// + /// @brief Recommended size of intermediate data partitions. + FLUENT_FIELD_OPTION(ui64, PartitionDataSize); + + /// + /// @brief Account to use for intermediate data. + FLUENT_FIELD_OPTION(TString, IntermediateDataAccount); + + /// + /// @brief Replication factor for intermediate data (1 by default). + FLUENT_FIELD_OPTION(ui64, IntermediateDataReplicationFactor); + + /// + /// @brief Recommended size of data to be passed to a single reduce combiner. + FLUENT_FIELD_OPTION(ui64, DataSizePerSortJob); + + /// + /// @brief Whether to guarantee the order of rows passed to mapper matches the order in the table. + /// + /// @see @ref NYT::TMapOperationSpec::Ordered for more info. + FLUENT_FIELD_OPTION(bool, Ordered); + + /// + /// @brief Guarantee to run reduce combiner before reducer. + FLUENT_FIELD_OPTION(bool, ForceReduceCombiners); +}; + +/// +/// @brief Spec of MapReduce operation.
+/// +/// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce +struct TMapReduceOperationSpec + : public TMapReduceOperationSpecBase<TMapReduceOperationSpec> + , public TOperationIOSpec<TMapReduceOperationSpec> + , public TIntermediateTablesHintSpec<TMapReduceOperationSpec> +{ + /// @cond Doxygen_Suppress + using TSelf = TMapReduceOperationSpec; + /// @endcond + + /// + /// @brief Format hints for mapper. + FLUENT_FIELD_DEFAULT(TUserJobFormatHints, MapperFormatHints, TUserJobFormatHints()); + + /// + /// @brief Format hints for reducer. + FLUENT_FIELD_DEFAULT(TUserJobFormatHints, ReducerFormatHints, TUserJobFormatHints()); + + /// + /// @brief Format hints for reduce combiner. + FLUENT_FIELD_DEFAULT(TUserJobFormatHints, ReduceCombinerFormatHints, TUserJobFormatHints()); +}; + +/// +/// @brief Spec of raw MapReduce operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce +struct TRawMapReduceOperationSpec + : public TMapReduceOperationSpecBase<TRawMapReduceOperationSpec> + , public TRawMapReduceOperationIoSpec<TRawMapReduceOperationSpec> +{ }; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Schema inference mode. +/// +/// @see https://yt.yandex-team.ru/docs/description/storage/static_schema.html#schema_inference +enum class ESchemaInferenceMode : int +{ + FromInput /* "from_input" */, + FromOutput /* "from_output" */, + Auto /* "auto" */, +}; + +/// +/// @brief Spec of Sort operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/sort +struct TSortOperationSpec + : TOperationSpecBase<TSortOperationSpec> +{ + /// @cond Doxygen_Suppress + using TSelf = TSortOperationSpec; + /// @endcond + + /// + /// @brief Paths to input tables. + FLUENT_VECTOR_FIELD(TRichYPath, Input); + + /// + /// @brief Path to output table. + FLUENT_FIELD(TRichYPath, Output); + + /// + /// @brief Columns to sort table by. 
+ FLUENT_FIELD(TSortColumns, SortBy); + + /// + /// @brief Recommended number of intermediate data partitions. + FLUENT_FIELD_OPTION(ui64, PartitionCount); + + /// + /// @brief Recommended size of intermediate data partitions. + FLUENT_FIELD_OPTION(ui64, PartitionDataSize); + + /// + /// @brief Recommended number of partition jobs to run. + /// + /// `PartitionJobCount` has higher priority than @ref NYT::TSortOperationSpec::DataSizePerPartitionJob. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui64, PartitionJobCount); + + /// + /// @brief Recommended data size for each partition job. + /// + /// `DataSizePerPartitionJob` has lower priority than @ref NYT::TSortOperationSpec::PartitionJobCount. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui64, DataSizePerPartitionJob); + + /// + /// @brief Inference mode for output table schema. + /// + /// @see https://yt.yandex-team.ru/docs/description/storage/static_schema.html#schema_inference + FLUENT_FIELD_OPTION(ESchemaInferenceMode, SchemaInferenceMode); + + /// + /// @brief Account to use for intermediate data. + FLUENT_FIELD_OPTION(TString, IntermediateDataAccount); + + /// + /// @brief Replication factor for intermediate data (1 by default). + FLUENT_FIELD_OPTION(ui64, IntermediateDataReplicationFactor); +}; + + +/// +/// @brief Merge mode. +enum EMergeMode : int +{ + MM_UNORDERED /* "unordered" */, + MM_ORDERED /* "ordered" */, + MM_SORTED /* "sorted" */, +}; + +/// +/// @brief Spec of Merge operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/merge +struct TMergeOperationSpec + : TOperationSpecBase<TMergeOperationSpec> +{ + /// @cond Doxygen_Suppress + using TSelf = TMergeOperationSpec; + /// @endcond + + /// + /// @brief Paths to input tables. + FLUENT_VECTOR_FIELD(TRichYPath, Input); + + /// + /// @brief Path to output table. 
+ FLUENT_FIELD(TRichYPath, Output); + + /// + /// @brief Columns by which to merge (for @ref NYT::EMergeMode::MM_SORTED). + FLUENT_FIELD(TSortColumns, MergeBy); + + /// + /// @brief Merge mode. + FLUENT_FIELD_DEFAULT(EMergeMode, Mode, MM_UNORDERED); + + /// + /// @brief Combine output chunks to larger ones. + FLUENT_FIELD_DEFAULT(bool, CombineChunks, false); + + /// + /// @brief Guarantee that all input chunks will be read. + FLUENT_FIELD_DEFAULT(bool, ForceTransform, false); + + /// + /// @brief Recommended number of jobs to run. + /// + /// `JobCount` has higher priority than @ref NYT::TMergeOperationSpec::DataSizePerJob. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui32, JobCount); + + /// + /// @brief Recommended data size for each job. + /// + /// `DataSizePerJob` has lower priority than @ref NYT::TMergeOperationSpec::JobCount. + /// This option only provides a recommendation and may be ignored if conflicting with YT internal limits. + FLUENT_FIELD_OPTION(ui64, DataSizePerJob); + + /// + /// @brief Inference mode for output table schema. + /// + /// @see https://yt.yandex-team.ru/docs/description/storage/static_schema.html#schema_inference + FLUENT_FIELD_OPTION(ESchemaInferenceMode, SchemaInferenceMode); +}; + +/// +/// @brief Spec of Erase operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/erase +struct TEraseOperationSpec + : TOperationSpecBase<TEraseOperationSpec> +{ + /// @cond Doxygen_Suppress + using TSelf = TEraseOperationSpec; + /// @endcond + + /// + /// @brief Which table (or row range) to erase. + FLUENT_FIELD(TRichYPath, TablePath); + + /// + /// @brief Combine output chunks to larger ones. + FLUENT_FIELD_DEFAULT(bool, CombineChunks, false); + + /// + /// @brief Inference mode for output table schema. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/storage/static_schema.html#schema_inference + FLUENT_FIELD_OPTION(ESchemaInferenceMode, SchemaInferenceMode); +}; + +/// +/// @brief Spec of RemoteCopy operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/remote_copy +struct TRemoteCopyOperationSpec + : TOperationSpecBase<TRemoteCopyOperationSpec> +{ + /// @cond Doxygen_Suppress + using TSelf = TRemoteCopyOperationSpec; + /// @endcond + + /// + /// @brief Source cluster name. + FLUENT_FIELD(TString, ClusterName); + + /// + /// @brief Network to use for copy (all remote cluster nodes must have it configured). + FLUENT_FIELD_OPTION(TString, NetworkName); + + /// + /// @brief Paths to input tables. + FLUENT_VECTOR_FIELD(TRichYPath, Input); + + /// + /// @brief Path to output table. + FLUENT_FIELD(TRichYPath, Output); + + /// + /// @brief Inference mode for output table schema. + /// + /// @see https://yt.yandex-team.ru/docs/description/storage/static_schema.html#schema_inference + FLUENT_FIELD_OPTION(ESchemaInferenceMode, SchemaInferenceMode); + + /// + /// @brief Copy user attributes from input to output table (allowed only for single input table). + FLUENT_FIELD_DEFAULT(bool, CopyAttributes, false); + + /// + /// @brief Names of user attributes to copy from input to output table. + /// + /// @note To make this option make sense set @ref NYT::TRemoteCopyOperationSpec::CopyAttributes to `true`. + FLUENT_VECTOR_FIELD(TString, AttributeKey); + +private: + + /// + /// @brief Config for remote cluster connection. + FLUENT_FIELD_OPTION(TNode, ClusterConnection); +}; + +class IVanillaJobBase; + +/// +/// @brief Task of Vanilla operation. 
+/// +/// @see https://yt.yandex-team.ru/docs/description/mr/vanilla +struct TVanillaTask + : public TOperationOutputSpecBase + , public TUserJobOutputFormatHintsBase<TVanillaTask> +{ + /// @cond Doxygen_Suppress + using TSelf = TVanillaTask; + /// @endcond + + /// + /// @brief Add output table path and specify the task output type (i.e. TMyProtoMessage). + template <class T> + TSelf& AddOutput(const TRichYPath& path); + + /// + /// @brief Add output table path as structured path. + TSelf& AddStructuredOutput(TStructuredTablePath path); + + /// + /// @brief Set output table path and specify the task output type (i.e. TMyProtoMessage). + template <class T> + TSelf& SetOutput(size_t tableIndex, const TRichYPath& path); + + /// + /// @brief Task name. + FLUENT_FIELD(TString, Name); + + /// + /// @brief Job to be executed in this task. + FLUENT_FIELD(::TIntrusivePtr<IVanillaJobBase>, Job); + + /// + /// @brief User job spec. + FLUENT_FIELD(TUserJobSpec, Spec); + + /// + /// @brief Number of jobs to run and wait for successful completion. + /// + /// @note If @ref NYT::TUserOperationSpecBase::FailOnJobRestart is `false`, a failed job will be restarted + /// and will not count in this amount. + FLUENT_FIELD(ui64, JobCount); + + /// + /// @brief Network project name. + FLUENT_FIELD(TMaybe<TString>, NetworkProject); + +}; + +/// +/// @brief Spec of Vanilla operation. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/vanilla +struct TVanillaOperationSpec + : TUserOperationSpecBase<TVanillaOperationSpec> +{ + /// @cond Doxygen_Suppress + using TSelf = TVanillaOperationSpec; + /// @endcond + + /// + /// @brief Description of tasks to run in this operation. + FLUENT_VECTOR_FIELD(TVanillaTask, Task); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Options for @ref NYT::IOperationClient::Map and other operation start commands. 
+struct TOperationOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TOperationOptions; + /// @endcond + + /// + /// @brief Additional field to put to operation spec. + FLUENT_FIELD_OPTION(TNode, Spec); + + /// + /// @brief Start operation mode. + enum class EStartOperationMode : int + { + /// + /// @brief Prepare operation asynchronously. Call IOperation::Start() to start operation. + AsyncPrepare, + + /// + /// @brief Prepare and start operation asynchronously. Don't wait for operation completion. + AsyncStart, + + /// + /// @brief Prepare and start operation synchronously. Don't wait for operation completion. + SyncStart, + + /// + /// @brief Prepare, start and wait for operation completion synchronously. + SyncWait, + }; + + /// + /// @brief Start operation mode. + FLUENT_FIELD_DEFAULT(EStartOperationMode, StartOperationMode, EStartOperationMode::SyncWait); + + /// + /// @brief Wait for operation finish synchronously. + /// + /// @deprecated Use StartOperationMode() instead. + TSelf& Wait(bool value) { + StartOperationMode_ = value ? EStartOperationMode::SyncWait : EStartOperationMode::SyncStart; + return static_cast<TSelf&>(*this); + } + + /// + /// + /// @brief Use format from table attribute (for YAMR-like format). + /// + /// @deprecated + FLUENT_FIELD_DEFAULT(bool, UseTableFormats, false); + + /// + /// @brief Prefix for bash command running the jobs. + /// + /// Can be overridden for the specific job type in the @ref NYT::TUserJobSpec. + FLUENT_FIELD(TString, JobCommandPrefix); + + /// + /// @brief Suffix for bash command running the jobs. + /// + /// Can be overridden for the specific job type in the @ref NYT::TUserJobSpec. + FLUENT_FIELD(TString, JobCommandSuffix); + + /// + /// @brief Put all files required by the job into tmpfs. + /// + /// This option can be set globally using @ref NYT::TConfig::MountSandboxInTmpfs. 
+ /// @see https://yt.yandex-team.ru/docs/problems/woodpeckers + FLUENT_FIELD_DEFAULT(bool, MountSandboxInTmpfs, false); + + /// + /// @brief Path to directory to store temporary files. + FLUENT_FIELD_OPTION(TString, FileStorage); + + /// + /// @brief Expiration timeout for uploaded files. + FLUENT_FIELD_OPTION(TDuration, FileExpirationTimeout); + + /// + /// @brief Info to be passed securely to the job. + FLUENT_FIELD_OPTION(TNode, SecureVault); + + /// + /// @brief File cache mode. + enum class EFileCacheMode : int + { + /// + /// @brief Use YT API commands "get_file_from_cache" and "put_file_to_cache". + ApiCommandBased, + + /// + /// @brief Upload files to random paths inside @ref NYT::TOperationOptions::FileStorage without caching. + CachelessRandomPathUpload, + }; + + /// + /// @brief File cache mode. + FLUENT_FIELD_DEFAULT(EFileCacheMode, FileCacheMode, EFileCacheMode::ApiCommandBased); + + /// + /// @brief Id of transaction within which all Cypress file storage entries will be checked/created. + /// + /// By default, the root transaction is used. + /// + /// @note Set a specific transaction only if you + /// 1. specify non-default file storage path in @ref NYT::TOperationOptions::FileStorage or in @ref NYT::TConfig::RemoteTempFilesDirectory. + /// 2. use `CachelessRandomPathUpload` caching mode (@ref NYT::TOperationOptions::FileCacheMode). + FLUENT_FIELD(TTransactionId, FileStorageTransactionId); + + /// + /// @brief Ensure stderr and core tables exist before starting operation. + /// + /// If set to `false`, it is user's responsibility to ensure these tables exist. + FLUENT_FIELD_DEFAULT(bool, CreateDebugOutputTables, true); + + /// + /// @brief Ensure output tables exist before starting operation. + /// + /// If set to `false`, it is user's responsibility to ensure output tables exist. + FLUENT_FIELD_DEFAULT(bool, CreateOutputTables, true); + + /// + /// @brief Try to infer schema of inexistent table from the type of written rows. 
+ /// + /// @note Default values for this option may differ depending on the row type. + /// For protobuf it's currently `false` by default. + FLUENT_FIELD_OPTION(bool, InferOutputSchema); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Get operation secure vault (specified in @ref NYT::TOperationOptions::SecureVault) inside a job. +const TNode& GetJobSecureVault(); + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Context passed to @ref NYT::IRawJob::Do. +class TRawJobContext +{ +public: + explicit TRawJobContext(size_t outputTableCount); + + /// + /// @brief Get file corresponding to input stream. + const TFile& GetInputFile() const; + + /// + /// @brief Get files corresponding to output streams. + const TVector<TFile>& GetOutputFileList() const; + +private: + TFile InputFile_; + TVector<TFile> OutputFileList_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Interface for classes that can be Saved/Loaded (to be used with @ref Y_SAVELOAD_JOB). +class ISerializableForJob +{ +public: + virtual ~ISerializableForJob() = default; + + /// + /// @brief Dump state to output stream to be restored in job. + virtual void Save(IOutputStream& stream) const = 0; + + /// + /// @brief Load state from a stream. + virtual void Load(IInputStream& stream) = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Provider of information about operation inputs/outputs during @ref NYT::IJob::PrepareOperation. +class IOperationPreparationContext +{ +public: + virtual ~IOperationPreparationContext() = default; + + /// @brief Get the number of input tables. + virtual int GetInputCount() const = 0; + + /// @brief Get the number of output tables. + virtual int GetOutputCount() const = 0; + + /// @brief Get the schema of input table no. `index`. 
+ virtual const TTableSchema& GetInputSchema(int index) const = 0; + + /// @brief Get all the input table schemas. + virtual const TVector<TTableSchema>& GetInputSchemas() const = 0; + + /// @brief Path to the input table if available (`Nothing()` for intermediate tables). + virtual TMaybe<TYPath> GetInputPath(int index) const = 0; + + /// @brief Path to the output table if available (`Nothing()` for intermediate tables). + virtual TMaybe<TYPath> GetOutputPath(int index) const = 0; +}; + +/// +/// @brief Fluent builder class for @ref NYT::IJob::PrepareOperation. +/// +/// @note Method calls are supposed to be chained. +class TJobOperationPreparer +{ +public: + + /// + /// @brief Group of input tables that allows to specify properties on all of them at once. + /// + /// The instances are created with @ref NYT::TJobOperationPreparer::BeginInputGroup, not directly. + class TInputGroup + { + public: + TInputGroup(TJobOperationPreparer& preparer, TVector<int> indices); + + /// @brief Specify the type of input rows. + template <typename TRow> + TInputGroup& Description(); + + /// @brief Specify renaming of input columns. + TInputGroup& ColumnRenaming(const THashMap<TString, TString>& renaming); + + /// @brief Specify what input columns to send to job + /// + /// @note Filter is applied before renaming, so it must specify original column names. + TInputGroup& ColumnFilter(const TVector<TString>& columns); + + /// @brief Finish describing the input group. + TJobOperationPreparer& EndInputGroup(); + + private: + TJobOperationPreparer& Preparer_; + TVector<int> Indices_; + }; + + /// + /// @brief Group of output tables that allows to specify properties on all of them at once. + /// + /// The instances are created with @ref NYT::TJobOperationPreparer::BeginOutputGroup, not directly. + class TOutputGroup + { + public: + TOutputGroup(TJobOperationPreparer& preparer, TVector<int> indices); + + /// @brief Specify the type of output rows. 
+ /// + /// @tparam TRow type of output rows from tables of this group. + /// @param inferSchema Infer schema from `TRow` and specify it for these output tables. + template <typename TRow> + TOutputGroup& Description(bool inferSchema = true); + + /// @brief Specify schema for these tables. + TOutputGroup& Schema(const TTableSchema& schema); + + /// @brief Specify that all the tables in this group are unschematized. + /// + /// It is equivalent to `.Schema(TTableSchema().Strict(false))`. + TOutputGroup& NoSchema(); + + /// @brief Finish describing the output group. + TJobOperationPreparer& EndOutputGroup(); + + private: + TJobOperationPreparer& Preparer_; + TVector<int> Indices_; + }; + +public: + explicit TJobOperationPreparer(const IOperationPreparationContext& context); + + /// @brief Begin input group consisting of tables with indices `[begin, end)`. + /// + /// @param begin First index. + /// @param end Index after the last one. + TInputGroup BeginInputGroup(int begin, int end); + + /// @brief Begin input group consisting of tables with indices from `indices`. + /// + /// @tparam TCont Container with integers. Must support `std::begin` and `std::end` functions. + /// @param indices Indices of tables to include in the group. + template <typename TCont> + TInputGroup BeginInputGroup(const TCont& indices); + + /// @brief Begin output group consisting of tables with indices `[begin, end)`. + /// + /// @param begin First index. + /// @param end Index after the last one. + TOutputGroup BeginOutputGroup(int begin, int end); + + /// @brief Begin output group consisting of tables with indices from `indices`. + /// + /// @tparam TCont Container with integers. Must support `std::begin` and `std::end` functions. + /// @param indices Indices of tables to include in the group. + template <typename TCont> + TOutputGroup BeginOutputGroup(const TCont& indices); + + /// @brief Specify the schema for output table no. `tableIndex`. 
+ /// + /// @note All the output schemas must be specified either with this method, `NoOutputSchema` or `OutputDescription` with `inferSchema == true` + TJobOperationPreparer& OutputSchema(int tableIndex, TTableSchema schema); + + /// @brief Mark the output table no. `tableIndex` as unschematized. + TJobOperationPreparer& NoOutputSchema(int tableIndex); + + /// @brief Specify renaming of input columns for table no. `tableIndex`. + TJobOperationPreparer& InputColumnRenaming(int tableIndex, const THashMap<TString, TString>& renaming); + + /// @brief Specify what input columns of table no. `tableIndex` to send to job + /// + /// @note Filter is applied before renaming, so it must specify original column names. + TJobOperationPreparer& InputColumnFilter(int tableIndex, const TVector<TString>& columns); + + /// @brief Specify the type of input rows for table no. `tableIndex`. + /// + /// @tparam TRow type of input rows. + template <typename TRow> + TJobOperationPreparer& InputDescription(int tableIndex); + + /// @brief Specify the type of output rows for table no. `tableIndex`. + /// + /// @tparam TRow type of output rows. + /// @param inferSchema Infer schema from `TRow` and specify it for the output tables. + template <typename TRow> + TJobOperationPreparer& OutputDescription(int tableIndex, bool inferSchema = true); + + /// @brief Set type of output rows for table no. `tableIndex` to TNode + /// + /// @note Set schema via `OutputSchema` if needed + TJobOperationPreparer& NodeOutput(int tableIndex); + + /// @brief Specify input format hints. + /// + /// These hints have lower priority than ones specified in spec. + TJobOperationPreparer& InputFormatHints(TFormatHints hints); + + /// @brief Specify output format hints. + /// + /// These hints have lower priority than ones specified in spec. + TJobOperationPreparer& OutputFormatHints(TFormatHints hints); + + /// @brief Specify format hints. + /// + /// These hints have lower priority than ones specified in spec. 
+ TJobOperationPreparer& FormatHints(TUserJobFormatHints newFormatHints); + + /// @name "Private" members + /// The following methods should not be used by clients in @ref NYT::IJob::PrepareOperation + ///@{ + + /// @brief Finish the building process. + void Finish(); + + /// @brief Get output table schemas as specified by the user. + TVector<TTableSchema> GetOutputSchemas(); + + /// @brief Get input column renamings as specified by the user. + const TVector<THashMap<TString, TString>>& GetInputColumnRenamings() const; + + /// @brief Get input column filters as specified by the user. + const TVector<TMaybe<TVector<TString>>>& GetInputColumnFilters() const; + + /// @brief Get input column descriptions as specified by the user. + const TVector<TMaybe<TTableStructure>>& GetInputDescriptions() const; + + /// @brief Get output column descriptions as specified by the user. + const TVector<TMaybe<TTableStructure>>& GetOutputDescriptions() const; + + /// @brief Get format hints as specified by the user. + const TUserJobFormatHints& GetFormatHints() const; + + ///@} +private: + + /// @brief Validate that schema for output table no. `tableIndex` has not been set yet. + void ValidateMissingOutputSchema(int tableIndex) const; + + /// @brief Validate that description for input table no. `tableIndex` has not been set yet. + void ValidateMissingInputDescription(int tableIndex) const; + + /// @brief Validate that description for output table no. `tableIndex` has not been set yet. + void ValidateMissingOutputDescription(int tableIndex) const; + + /// @brief Validate that `tableIndex` is in correct range for input table indices. + /// + /// @param message Message to add to the exception in case of violation. + void ValidateInputTableIndex(int tableIndex, TStringBuf message) const; + + /// @brief Validate that `tableIndex` is in correct range for output table indices. + /// + /// @param message Message to add to the exception in case of violation. 
+ void ValidateOutputTableIndex(int tableIndex, TStringBuf message) const; + + /// @brief Validate that all the output schemas have been set. + void FinallyValidate() const; + + static TTableSchema EmptyNonstrictSchema(); + +private: + const IOperationPreparationContext& Context_; + + TVector<TMaybe<TTableSchema>> OutputSchemas_; + TVector<THashMap<TString, TString>> InputColumnRenamings_; + TVector<TMaybe<TVector<TString>>> InputColumnFilters_; + TVector<TMaybe<TTableStructure>> InputTableDescriptions_; + TVector<TMaybe<TTableStructure>> OutputTableDescriptions_; + TUserJobFormatHints FormatHints_ = {}; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Interface for all user jobs. +class IJob + : public TThrRefBase +{ +public: + + /// + /// @brief Type of job. + enum EType + { + Mapper, + Reducer, + ReducerAggregator, + RawJob, + VanillaJob, + }; + + /// + /// @brief Save job state to stream to be restored on cluster nodes. + virtual void Save(IOutputStream& stream) const + { + Y_UNUSED(stream); + } + + /// + /// @brief Restore job state from a stream. + virtual void Load(IInputStream& stream) + { + Y_UNUSED(stream); + } + + /// + /// @brief Get operation secure vault (specified in @ref NYT::TOperationOptions::SecureVault) inside a job. + const TNode& SecureVault() const + { + return GetJobSecureVault(); + } + + /// + /// @brief Get number of output tables. + /// + /// @note The count is checked with `Y_VERIFY` to be positive. + i64 GetOutputTableCount() const + { + Y_VERIFY(NDetail::OutputTableCount > 0); + + return NDetail::OutputTableCount; + } + + /// + /// @brief Method allowing user to control some properties of input and output tables and formats. + /// + /// User can override this method in their job class to: + /// - specify output table schemas. 
+ /// The most natural way is usually through @ref NYT::TJobOperationPreparer::OutputDescription (especially for protobuf), + /// but you can use @ref NYT::TJobOperationPreparer::OutputSchema directly + /// - specify output row type (@ref NYT::TJobOperationPreparer::OutputDescription) + /// - specify input row type (@ref NYT::TJobOperationPreparer::InputDescription) + /// - specify input column filter and renaming (@ref NYT::TJobOperationPreparer::InputColumnFilter and @ref NYT::TJobOperationPreparer::InputColumnRenaming) + /// - specify format hints (@ref NYT::TJobOperationPreparer::InputFormatHints, + /// NYT::TJobOperationPreparer::OutputFormatHints and @ref NYT::TJobOperationPreparer::FormatHints) + /// - maybe something more, cf. the methods of @ref NYT::TJobOperationPreparer. + /// + /// If one has several similar tables, groups can be used. + /// Groups are delimited by @ref NYT::TJobOperationPreparer::BeginInputGroup / + /// @ref NYT::TJobOperationPreparer::TInputGroup::EndInputGroup and + /// @ref NYT::TJobOperationPreparer::BeginOutputGroup / + /// @ref NYT::TJobOperationPreparer::TOutputGroup::EndOutputGroup. + /// Example: + /// @code{.cpp} + /// preparer + /// .BeginInputGroup({1,2,4,8}) + /// .ColumnRenaming({{"a", "b"}, {"c", "d"}}) + /// .ColumnFilter({"a", "c"}) + /// .EndInputGroup(); + /// @endcode + /// + /// @note All the output table schemas must be set + /// (possibly as empty nonstrict using @ref NYT::TJobOperationPreparer::NoOutputSchema or + /// @ref NYT::TJobOperationPreparer::TOutputGroup::NoSchema). + /// By default all the output table schemas are marked as empty nonstrict. + virtual void PrepareOperation(const IOperationPreparationContext& context, TJobOperationPreparer& preparer) const; +}; + +/// +/// @brief Declare what fields of currently declared job class to save and restore on cluster node. +#define Y_SAVELOAD_JOB(...) 
 \ + virtual void Save(IOutputStream& stream) const override { Save(&stream); } \ + virtual void Load(IInputStream& stream) override { Load(&stream); } \ + Y_PASS_VA_ARGS(Y_SAVELOAD_DEFINE(__VA_ARGS__)) + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Interface for jobs with typed inputs and outputs. +class IStructuredJob + : public IJob +{ +public: + /// + /// @brief These methods are called when creating table reader and writer for the job. + /// + /// Override them if you want to implement custom I/O logic (e.g. additional buffering). + virtual TRawTableReaderPtr CreateCustomRawJobReader(int fd) const; + virtual THolder<IProxyOutput> CreateCustomRawJobWriter(size_t outputTableCount) const; + + virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const = 0; + virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Create default raw job reader. +TRawTableReaderPtr CreateRawJobReader(int fd = 0); + +/// +/// @brief Create default raw job writer. +THolder<IProxyOutput> CreateRawJobWriter(size_t outputTableCount); + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Base interface for structured (typed) map jobs. +class IMapperBase + : public IStructuredJob +{ }; + +/// +/// @brief Base interface for structured (typed) map jobs with given reader and writer. +template <class TR, class TW> +class IMapper + : public IMapperBase +{ +public: + using TReader = TR; + using TWriter = TW; + +public: + /// Type of job implemented by this class. + static constexpr EType JobType = EType::Mapper; + + /// + /// @brief This method is called before feeding input rows to mapper (before `Do` method). 
+ virtual void Start(TWriter* writer) + { + Y_UNUSED(writer); + } + + /// + /// @brief This method is called exactly once for the whole job input. + /// + /// Read input rows from `reader` and write output ones to `writer`. + virtual void Do(TReader* reader, TWriter* writer) = 0; + + /// + /// @brief This method is called after feeding input rows to mapper (after `Do` method). + virtual void Finish(TWriter* writer) + { + Y_UNUSED(writer); + } + + virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const override; + virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const override; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Base interface for structured (typed) reduce jobs. +/// +/// It is common base for @ref NYT::IReducer and @ref NYT::IAggregatorReducer. +class IReducerBase + : public IStructuredJob +{ }; + +/// +/// @brief Base interface for structured (typed) reduce jobs with given reader and writer. +template <class TR, class TW> +class IReducer + : public IReducerBase +{ +public: + using TReader = TR; + using TWriter = TW; + +public: + /// Type of job implemented by this class. + static constexpr EType JobType = EType::Reducer; + +public: + + /// + /// @brief This method is called before feeding input rows to reducer (before `Do` method). + virtual void Start(TWriter* writer) + { + Y_UNUSED(writer); + } + + /// + /// @brief This method is called exactly once for each range with same value of `ReduceBy` (or `JoinBy`) keys. + virtual void Do(TReader* reader, TWriter* writer) = 0; + + /// + /// @brief This method is called after feeding input rows to reducer (after `Do` method). + virtual void Finish(TWriter* writer) + { + Y_UNUSED(writer); + } + + /// + /// @brief Refuse to process the remaining row ranges and finish the job (successfully). 
+ void Break(); + + virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const override; + virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const override; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Base interface of jobs used inside reduce operations. +/// +/// Unlike @ref NYT::IReducer jobs their `Do' method is called only once +/// and takes whole range of records split by key boundaries. +/// +/// Template argument `TR` must be @ref NYT::TTableRangesReader. +template <class TR, class TW> +class IAggregatorReducer + : public IReducerBase +{ +public: + using TReader = TR; + using TWriter = TW; + +public: + /// Type of job implemented by this class. + static constexpr EType JobType = EType::ReducerAggregator; + +public: + /// + /// @brief This method is called before feeding input rows to reducer (before `Do` method). + virtual void Start(TWriter* writer) + { + Y_UNUSED(writer); + } + + /// + /// @brief This method is called exactly once for the whole job input. + virtual void Do(TReader* reader, TWriter* writer) = 0; + + /// + /// @brief This method is called after feeding input rows to reducer (after `Do` method). + virtual void Finish(TWriter* writer) + { + Y_UNUSED(writer); + } + + virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const override; + virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const override; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Interface for raw jobs (i.e. reading and writing byte streams). +class IRawJob + : public IJob +{ +public: + /// Type of job implemented by this class. + static constexpr EType JobType = EType::RawJob; + + /// + /// @brief This method is called exactly once for the whole job input. 
+ virtual void Do(const TRawJobContext& jobContext) = 0; +}; + +/// +/// @brief Interface of jobs that run the given bash command. +class ICommandJob + : public IJob +{ +public: + /// + /// @brief Get bash command to run. + /// + /// @note This method is called on the client side. + virtual const TString& GetCommand() const = 0; +}; + +/// +/// @brief Raw job executing given bash command. +/// +/// @note The binary will not be uploaded. +class TCommandRawJob + : public IRawJob + , public ICommandJob +{ +public: + /// + /// @brief Create job with specified command. + /// + /// @param command Bash command to run. + explicit TCommandRawJob(TStringBuf command = {}); + + const TString& GetCommand() const override; + void Do(const TRawJobContext& jobContext) override; + +private: + TString Command_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Base interface for vanilla jobs. +/// +/// @see https://yt.yandex-team.ru/docs/description/mr/vanilla +class IVanillaJobBase + : public virtual IStructuredJob +{ +public: + /// Type of job implemented by this class. + static constexpr EType JobType = EType::VanillaJob; +}; + +template <class TW = void> +class IVanillaJob; + +/// +/// @brief Interface of vanilla job without outputs. +template <> +class IVanillaJob<void> + : public IVanillaJobBase +{ +public: + /// + /// @brief This method is called exactly once for each vanilla job. + virtual void Do() = 0; + + virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const override; + virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const override; +}; + +/// +/// @brief Vanilla job executing given bash command. +/// +/// @note The binary will not be uploaded. +class TCommandVanillaJob + : public IVanillaJob<> + , public ICommandJob +{ +public: + /// + /// @brief Create job with specified command. + /// + /// @param command Bash command to run. 
+ explicit TCommandVanillaJob(TStringBuf command = {}); + + const TString& GetCommand() const override; + void Do() override; + +private: + TString Command_; +}; + +/// +/// @brief Interface for vanilla jobs with output tables. +template <class TW> +class IVanillaJob + : public IVanillaJobBase +{ +public: + using TWriter = TW; + +public: + /// + /// @brief This method is called before `Do` method. + virtual void Start(TWriter* /* writer */) + { } + + /// + /// @brief This method is called exactly once for each vanilla job. + /// + /// Write output rows to `writer`. + virtual void Do(TWriter* writer) = 0; + + /// + /// @brief This method is called after `Do` method. + virtual void Finish(TWriter* /* writer */) + { } + + virtual TStructuredRowStreamDescription GetInputRowStreamDescription() const override; + virtual TStructuredRowStreamDescription GetOutputRowStreamDescription() const override; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Attributes to request for an operation. +enum class EOperationAttribute : int +{ + Id /* "id" */, + Type /* "type" */, + State /* "state" */, + AuthenticatedUser /* "authenticated_user" */, + StartTime /* "start_time" */, + FinishTime /* "finish_time" */, + BriefProgress /* "brief_progress" */, + BriefSpec /* "brief_spec" */, + Suspended /* "suspended" */, + Result /* "result" */, + Progress /* "progress" */, + Events /* "events" */, + Spec /* "spec" */, + FullSpec /* "full_spec" */, + UnrecognizedSpec /* "unrecognized_spec" */, +}; + +/// +/// @brief Class describing which attributes to request in @ref NYT::IClient::GetOperation or @ref NYT::IClient::ListOperations. +struct TOperationAttributeFilter +{ + /// @cond Doxygen_Suppress + using TSelf = TOperationAttributeFilter; + /// @endcond + + TVector<EOperationAttribute> Attributes_; + + /// + /// @brief Add attribute to the filter. Calls are supposed to be chained. 
+ TSelf& Add(EOperationAttribute attribute) + { + Attributes_.push_back(attribute); + return *this; + } +}; + +/// +/// @brief Options for @ref NYT::IClient::GetOperation call. +struct TGetOperationOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetOperationOptions; + /// @endcond + + /// + /// @brief What attributes to request (if omitted, the default set of attributes will be requested). + FLUENT_FIELD_OPTION(TOperationAttributeFilter, AttributeFilter); +}; + +/// +/// @brief "Coarse-grained" state of an operation. +enum class EOperationBriefState : int +{ + InProgress /* "in_progress" */, + Completed /* "completed" */, + Aborted /* "aborted" */, + + /// Failed + Failed /* "failed" */, +}; + +/// +/// @brief Operation type. +enum class EOperationType : int +{ + Map /* "map" */, + Merge /* "merge" */, + Erase /* "erase" */, + Sort /* "sort" */, + Reduce /* "reduce" */, + MapReduce /* "map_reduce" */, + RemoteCopy /* "remote_copy" */, + JoinReduce /* "join_reduce" */, + Vanilla /* "vanilla" */, +}; + +/// +/// @brief Operation progress. +struct TOperationProgress +{ + /// + /// @brief Total job statistics. + TJobStatistics JobStatistics; + + /// + /// @brief Job counter for various job states with hierarchy. + TJobCounters JobCounters; + + /// + /// @brief Time when this progress was built on scheduler or CA. + TMaybe<TInstant> BuildTime; +}; + +/// +/// @brief Brief operation progress (numbers of jobs in these states). +struct TOperationBriefProgress +{ + ui64 Aborted = 0; + ui64 Completed = 0; + ui64 Failed = 0; + ui64 Lost = 0; + ui64 Pending = 0; + ui64 Running = 0; + ui64 Total = 0; +}; + +/// +/// @brief Operation result. +struct TOperationResult +{ + /// + /// @brief For an unsuccessfully finished operation: description of error. + TMaybe<TYtError> Error; +}; + +/// +/// @brief Operation event (change of state). +struct TOperationEvent +{ + /// + /// @brief New state of operation. + TString State; + + /// + /// @brief Time of state change.
+ TInstant Time; +}; + +/// +/// @brief Operation info. +/// +/// A field may be `Nothing()` either if it was not requested (see @ref NYT::TGetOperationOptions::AttributeFilter) +/// or it is not available (i.e. `FinishTime` for a running operation). +/// @see https://yt.yandex-team.ru/docs/api/commands#get_operation +struct TOperationAttributes +{ + /// + /// @brief Operation id. + TMaybe<TOperationId> Id; + + /// + /// @brief Operation type. + TMaybe<EOperationType> Type; + + /// + /// @brief Operation state. + TMaybe<TString> State; + + /// + /// @brief "Coarse-grained" operation state. + TMaybe<EOperationBriefState> BriefState; + + /// + /// @brief Name of user that started the operation. + TMaybe<TString> AuthenticatedUser; + + /// + /// @brief Operation start time. + TMaybe<TInstant> StartTime; + + /// + /// @brief Operation finish time (if the operation has finished). + TMaybe<TInstant> FinishTime; + + /// + /// @brief Brief progress of the operation. + TMaybe<TOperationBriefProgress> BriefProgress; + + /// + /// @brief Brief spec of operation (light-weight fields only). + TMaybe<TNode> BriefSpec; + + /// + /// @brief Spec of the operation as provided by the user. + TMaybe<TNode> Spec; + + /// + /// @brief Full spec of operation (all fields not specified by user are filled with default values). + TMaybe<TNode> FullSpec; + + /// + /// @brief Fields not recognized by scheduler. + TMaybe<TNode> UnrecognizedSpec; + + /// + /// @brief Is operation suspended. + TMaybe<bool> Suspended; + + /// + /// @brief Operation result. + TMaybe<TOperationResult> Result; + + /// + /// @brief Operation progress. + TMaybe<TOperationProgress> Progress; + + /// + /// @brief List of operation events (changes of state). + TMaybe<TVector<TOperationEvent>> Events; + + /// + /// @brief Map from alert name to its description. + TMaybe<THashMap<TString, TYtError>> Alerts; +}; + +/// +/// @brief Direction of cursor for paging, see @ref NYT::TListOperationsOptions::CursorDirection. 
+enum class ECursorDirection +{ + Past /* "past" */, + Future /* "future" */, +}; + +/// +/// @brief Options of @ref NYT::IClient::ListOperations command. +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#list_operations +struct TListOperationsOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TListOperationsOptions; + /// @endcond + + /// + /// @name Time range specification + /// + /// List operations with start time in half-closed interval + /// `[CursorTime, ToTime)` if `CursorDirection == Future` or + /// `[FromTime, CursorTime)` if `CursorDirection == Past`. + ///@{ + + /// + /// @brief Search for operations with start time >= `FromTime`. + FLUENT_FIELD_OPTION(TInstant, FromTime); + + /// + /// @brief Search for operations with start time < `ToTime`. + FLUENT_FIELD_OPTION(TInstant, ToTime); + + /// + /// @brief Additional restriction on operation start time (useful for pagination). + /// + /// Search for operations with start time >= `CursorTime` if `CursorDirection == Future` + /// and with start time < `CursorTime` if `CursorDirection == Past` + FLUENT_FIELD_OPTION(TInstant, CursorTime); + + /// + /// @brief Direction of pagination (see @ref NYT::TListOperationsOptions::CursorTime). + FLUENT_FIELD_OPTION(ECursorDirection, CursorDirection); + + ///@} + + /// + /// @name Filters + /// Choose operations satisfying given filters. + ///@{ + + /// + /// @brief Search for `Filter` as a substring in operation text factors + /// (e.g. title or input/output table paths). + FLUENT_FIELD_OPTION(TString, Filter); + + /// + /// @brief Choose operations whose pools include `Pool`. + FLUENT_FIELD_OPTION(TString, Pool); + + /// + /// @brief Choose operations with given @ref NYT::TOperationAttributes::AuthenticatedUser. + FLUENT_FIELD_OPTION(TString, User); + + /// + /// @brief Choose operations with given @ref NYT::TOperationAttributes::State. 
+ FLUENT_FIELD_OPTION(TString, State); + + /// + /// @brief Choose operations with given @ref NYT::TOperationAttributes::Type. + FLUENT_FIELD_OPTION(EOperationType, Type); + + /// + /// @brief Choose operations having (or not having) any failed jobs. + FLUENT_FIELD_OPTION(bool, WithFailedJobs); + + ///@} + + /// + /// @brief Search for operations in the archive in addition to Cypress. + FLUENT_FIELD_OPTION(bool, IncludeArchive); + + /// + /// @brief Include the counters for different filter parameters in the response. + /// + /// Include number of operations for each pool, user, state, type + /// and the number of operations having failed jobs. + FLUENT_FIELD_OPTION(bool, IncludeCounters); + + /// + /// @brief Return no more than `Limit` operations (current default and maximum value is 1000). + FLUENT_FIELD_OPTION(i64, Limit); +}; + +/// +/// @brief Response for @ref NYT::IClient::ListOperations command. +struct TListOperationsResult +{ + /// + /// @brief Found operations' attributes. + TVector<TOperationAttributes> Operations; + + /// + /// @name Counters for different filters. + /// + /// If counters were requested (@ref NYT::TListOperationsOptions::IncludeCounters is `true`) + /// the maps contain the number of operations found for each pool, user, state and type. + /// NOTE: + /// 1) Counters ignore CursorTime and CursorDirection, + /// they are always collected in the whole [FromTime, ToTime) interval. + /// 2) Each next counter in the sequence [pool, user, state, type, with_failed_jobs] + /// takes into account all the previous filters (e.g. if you set User filter to "some-user", + /// type counts describe only operations with user "some-user"). + /// @{ + + /// + /// @brief Number of operations for each pool. + TMaybe<THashMap<TString, i64>> PoolCounts; + + /// + /// @brief Number of operations for each user (subject to previous filters).
+ TMaybe<THashMap<TString, i64>> UserCounts; + + /// + /// @brief Number of operations for each state (subject to previous filters). + TMaybe<THashMap<TString, i64>> StateCounts; + + /// + /// @brief Number of operations for each type (subject to previous filters). + TMaybe<THashMap<EOperationType, i64>> TypeCounts; + + /// + /// @brief Number of operations having failed jobs (subject to all previous filters). + TMaybe<i64> WithFailedJobsCount; + + /// @} + + /// + /// @brief Whether some operations were not returned due to @ref NYT::TListOperationsOptions::Limit. + /// + /// `Incomplete == true` means that not all operations satisfying filters + /// were returned (limit exceeded) and you need to repeat the request with new @ref NYT::TListOperationsOptions::CursorTime + /// (e.g. `CursorTime == *Operations.back().StartTime`, but don't forget to + /// remove the duplicates). + /// + /// Defaults to `false`, so a default-constructed result never carries an indeterminate flag. + bool Incomplete = false; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Data source for @ref NYT::IClient::ListJobs command. +enum class EListJobsDataSource : int +{ + Runtime /* "runtime" */, + Archive /* "archive" */, + Auto /* "auto" */, + Manual /* "manual" */, +}; + +/// +/// @brief Job type.
+enum class EJobType : int +{ + SchedulerFirst /* "scheduler_first" */, + Map /* "map" */, + PartitionMap /* "partition_map" */, + SortedMerge /* "sorted_merge" */, + OrderedMerge /* "ordered_merge" */, + UnorderedMerge /* "unordered_merge" */, + Partition /* "partition" */, + SimpleSort /* "simple_sort" */, + FinalSort /* "final_sort" */, + SortedReduce /* "sorted_reduce" */, + PartitionReduce /* "partition_reduce" */, + ReduceCombiner /* "reduce_combiner" */, + RemoteCopy /* "remote_copy" */, + IntermediateSort /* "intermediate_sort" */, + OrderedMap /* "ordered_map" */, + JoinReduce /* "join_reduce" */, + Vanilla /* "vanilla" */, + SchedulerUnknown /* "scheduler_unknown" */, + SchedulerLast /* "scheduler_last" */, + ReplicatorFirst /* "replicator_first" */, + ReplicateChunk /* "replicate_chunk" */, + RemoveChunk /* "remove_chunk" */, + RepairChunk /* "repair_chunk" */, + SealChunk /* "seal_chunk" */, + ReplicatorLast /* "replicator_last" */, +}; + +/// +/// @brief Well-known task names. +enum class ETaskName : int +{ + Map /* "map" */, + PartitionMap0 /* "partition_map(0)" */, + SortedMerge /* "sorted_merge" */, + OrderedMerge /* "ordered_merge" */, + UnorderedMerge /* "unordered_merge" */, + Partition0 /* "partition(0)" */, + Partition1 /* "partition(1)" */, + Partition2 /* "partition(2)" */, + SimpleSort /* "simple_sort" */, + FinalSort /* "final_sort" */, + SortedReduce /* "sorted_reduce" */, + PartitionReduce /* "partition_reduce" */, + ReduceCombiner /* "reduce_combiner" */, + RemoteCopy /* "remote_copy" */, + IntermediateSort /* "intermediate_sort" */, + OrderedMap /* "ordered_map" */, + JoinReduce /* "join_reduce" */, +}; + +/// +/// @brief Task name (can be either well-known or just a string). +class TTaskName +{ +public: + + // Constructors are implicit by design. + + /// + /// @brief Construct a custom task name. + TTaskName(TString taskName); + + /// + /// @brief Construct a custom task name.
+ TTaskName(const char* taskName); + + /// + /// @brief Construct a well-known task name. + TTaskName(ETaskName taskName); + + /// + /// @brief Get the task name as a string. + const TString& Get() const; + +private: + TString TaskName_; +}; + +/// +/// @brief Job state. +enum class EJobState : int +{ + None /* "none" */, + Waiting /* "waiting" */, + Running /* "running" */, + Aborting /* "aborting" */, + Completed /* "completed" */, + Failed /* "failed" */, + Aborted /* "aborted" */, + Lost /* "lost" */, +}; + +/// +/// @brief Job sort field. +/// +/// @see @ref NYT::TListJobsOptions. +enum class EJobSortField : int +{ + Type /* "type" */, + State /* "state" */, + StartTime /* "start_time" */, + FinishTime /* "finish_time" */, + Address /* "address" */, + Duration /* "duration" */, + Progress /* "progress" */, + Id /* "id" */, +}; + +/// +/// @brief Job sort direction. +/// +/// @see @ref NYT::TListJobsOptions. +/// +/// NOTE(review): @ref NYT::TListJobsOptions::SortOrder is declared with `ESortOrder`, not this enum; confirm which one is intended. +enum class EJobSortDirection : int +{ + Ascending /* "ascending" */, + Descending /* "descending" */, +}; + +/// +/// @brief Options for @ref NYT::IClient::ListJobs. +/// +/// @see https://yt.yandex-team.ru/docs/api/commands.html#list_jobs +struct TListJobsOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TListJobsOptions; + /// @endcond + + /// + /// @name Filters + /// Return only jobs with given value of parameter (type, state, address and existence of stderr). + /// If a field is `Nothing()`, return jobs with all possible values of the corresponding parameter. + /// @{ + + /// + /// @brief Job type. + FLUENT_FIELD_OPTION(EJobType, Type); + + /// + /// @brief Job state. + FLUENT_FIELD_OPTION(EJobState, State); + + /// + /// @brief Address of the cluster node where job was running. + FLUENT_FIELD_OPTION(TString, Address); + + /// + /// @brief Return only jobs whose stderr has been saved. + FLUENT_FIELD_OPTION(bool, WithStderr); + + /// + /// @brief Return only jobs whose spec has been saved.
+ FLUENT_FIELD_OPTION(bool, WithSpec); + + /// + /// @brief Return only jobs whose fail context has been saved. + FLUENT_FIELD_OPTION(bool, WithFailContext); + + /// @} + + /// + /// @name Sort options + /// @{ + + /// + /// @brief Sort by this field. + FLUENT_FIELD_OPTION(EJobSortField, SortField); + + /// + /// @brief Sort order. + FLUENT_FIELD_OPTION(ESortOrder, SortOrder); + + /// @} + + /// + /// @brief Data source. + /// + /// Where to search for jobs: in scheduler and Cypress ('Runtime'), in archive ('Archive'), + /// automatically basing on operation presence in Cypress ('Auto') or choose manually (`Manual'). + FLUENT_FIELD_OPTION(EListJobsDataSource, DataSource); + + /// @deprecated + FLUENT_FIELD_OPTION(bool, IncludeCypress); + + /// @deprecated + FLUENT_FIELD_OPTION(bool, IncludeControllerAgent); + + /// @deprecated + FLUENT_FIELD_OPTION(bool, IncludeArchive); + + /// + /// @brief Maximum number of jobs to return. + FLUENT_FIELD_OPTION(i64, Limit); + + /// + /// @brief Number of jobs (in specified sort order) to skip. + /// + /// Together with @ref NYT::TListJobsOptions::Limit may be used for pagination. + FLUENT_FIELD_OPTION(i64, Offset); +}; + +/// +/// @brief Description of a core dump that happened in the job. +struct TCoreInfo +{ + i64 ProcessId; + TString ExecutableName; + TMaybe<ui64> Size; + TMaybe<TYtError> Error; +}; + +/// +/// @brief Job attributes. +/// +/// A field may be `Nothing()` if it is not available (i.e. `FinishTime` for a running job). +/// +/// @see https://yt.yandex-team.ru/docs/api/commands#get_job +struct TJobAttributes +{ + /// + /// @brief Job id. + TMaybe<TJobId> Id; + + /// + /// @brief Job type + TMaybe<EJobType> Type; + + /// + /// @brief Job state. + TMaybe<EJobState> State; + + /// + /// @brief Address of a cluster node where job was running. + TMaybe<TString> Address; + + /// + /// @brief The name of the task that job corresponds to. + TMaybe<TString> TaskName; + + /// + /// @brief Job start time. 
+ TMaybe<TInstant> StartTime; + + /// + /// @brief Job finish time (for a finished job). + TMaybe<TInstant> FinishTime; + + /// + /// @brief Estimated ratio of job's completed work. + TMaybe<double> Progress; + + /// + /// @brief Size of saved job stderr. + TMaybe<i64> StderrSize; + + /// + /// @brief Error for an unsuccessfully finished job. + TMaybe<TYtError> Error; + + /// + /// @brief Job brief statistics. + TMaybe<TNode> BriefStatistics; + + /// + /// @brief Job input paths (with ranges). + TMaybe<TVector<TRichYPath>> InputPaths; + + /// + /// @brief Infos for core dumps produced by job. + TMaybe<TVector<TCoreInfo>> CoreInfos; +}; + +/// +/// @brief Response for @ref NYT::IOperation::ListJobs. +struct TListJobsResult +{ + /// + /// @brief Jobs. + TVector<TJobAttributes> Jobs; + + /// + /// @deprecated + TMaybe<i64> CypressJobCount; + + /// + /// @brief Number of jobs retrieved from controller agent. + TMaybe<i64> ControllerAgentJobCount; + + /// + /// @brief Number of jobs retrieved from archive. + TMaybe<i64> ArchiveJobCount; +}; + +//////////////////////////////////////////////////////////////////// + +/// +/// @brief Options for @ref NYT::IClient::GetJob. +struct TGetJobOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetJobOptions; + /// @endcond +}; + +/// +/// @brief Options for @ref NYT::IClient::GetJobInput. +struct TGetJobInputOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetJobInputOptions; + /// @endcond +}; + +/// +/// @brief Options for @ref NYT::IClient::GetJobFailContext. +struct TGetJobFailContextOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetJobFailContextOptions; + /// @endcond +}; + +/// +/// @brief Options for @ref NYT::IClient::GetJobStderr. +struct TGetJobStderrOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetJobStderrOptions; + /// @endcond +}; + +//////////////////////////////////////////////////////////////////// + +/// +/// @brief Options for @ref NYT::IOperation::GetFailedJobInfo.
+struct TGetFailedJobInfoOptions +{ + /// @cond Doxygen_Suppress + using TSelf = TGetFailedJobInfoOptions; + /// @endcond + + /// + /// @brief How many jobs to download. Which jobs will be chosen is undefined. + FLUENT_FIELD_DEFAULT(ui64, MaxJobCount, 10); + + /// + /// @brief How much of stderr tail should be downloaded. + FLUENT_FIELD_DEFAULT(ui64, StderrTailSize, 64 * 1024); +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Interface representing an operation. +struct IOperation + : public TThrRefBase +{ + virtual ~IOperation() = default; + + /// + /// @brief Get operation id. + virtual const TOperationId& GetId() const = 0; + + /// + /// @brief Get URL of the operation in YT Web UI. + virtual TString GetWebInterfaceUrl() const = 0; + + /// + /// @brief Get last error for not started operations. Get state on YT cluster for started operations. + /// + /// For not started operations last error is an error that's being retried during operation + /// preparation/start (e.g. lock files, start operation request). + virtual TString GetStatus() const = 0; + + /// + /// @brief Get preparation future. + /// + /// @return future that is set when operation is prepared. + virtual ::NThreading::TFuture<void> GetPreparedFuture() = 0; + + /// + /// @brief Start operation synchronously. + /// + /// @note: Do NOT call this method twice. + /// + /// If operation is not prepared yet, Start() will block waiting for preparation finish. + /// Be ready to catch exception if operation preparation or start failed. + virtual void Start() = 0; + + /// + /// @brief Is the operation started + /// + /// Returns true if the operation is started on the cluster + virtual bool IsStarted() const = 0; + + /// + /// @brief Get start future. + /// + /// @return future that is set when operation is started. + virtual ::NThreading::TFuture<void> GetStartedFuture() = 0; + + /// + /// @brief Start watching operation. 
+ /// + /// @return future that is set when operation is complete. + /// + /// @note: the user should check value of returned future to ensure that operation completed successfully e.g. + /// @code{.cpp} + /// auto operationComplete = operation->Watch(); + /// operationComplete.Wait(); + /// operationComplete.GetValue(); /// will throw if operation completed with errors + /// @endcode + /// + /// If operation is completed successfully the returned future contains void value. + /// If operation is completed with error future contains @ref NYT::TOperationFailedError. + /// In rare cases when error occurred while waiting (e.g. YT becomes unavailable) future might contain other exception. + virtual ::NThreading::TFuture<void> Watch() = 0; + + /// + /// @brief Get information about failed jobs. + /// + /// Can be called for operation in any stage. + /// Though user should keep in mind that this method always fetches info from cypress + /// and doesn't work when operation is archived. Successfully completed operations can be archived + /// quite quickly (in about 30 seconds). + virtual TVector<TFailedJobInfo> GetFailedJobInfo(const TGetFailedJobInfoOptions& options = TGetFailedJobInfoOptions()) = 0; + + /// + /// @brief Get operation brief state. + virtual EOperationBriefState GetBriefState() = 0; + + /// + /// @brief Get error (if operation has failed). + /// + /// @return `Nothing()` if operation is in 'Completed' or 'InProgress' state (or reason for failed / aborted operation). + virtual TMaybe<TYtError> GetError() = 0; + + /// + /// @brief Get job statistics. + virtual TJobStatistics GetJobStatistics() = 0; + + /// + /// @brief Get operation progress. + /// + /// @return `Nothing()` if operation has no running jobs yet, e.g. when it is in "materializing" or "pending" state. + virtual TMaybe<TOperationBriefProgress> GetBriefProgress() = 0; + + /// + /// @brief Abort operation. + /// + /// Operation will be finished immediately. + /// All results of completed/running jobs will be lost.
+ /// + /// @see https://yt.yandex-team.ru/docs/api/commands#abort_op + virtual void AbortOperation() = 0; + + /// + /// @brief Complete operation. + /// + /// Operation will be finished immediately. + /// All results of completed jobs will appear in output tables. + /// All results of running (not completed) jobs will be lost. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#complete_op + virtual void CompleteOperation() = 0; + + /// + /// @brief Suspend operation. + /// + /// Jobs will not be aborted by default, c.f. @ref NYT::TSuspendOperationOptions. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#suspend_op + virtual void SuspendOperation( + const TSuspendOperationOptions& options = TSuspendOperationOptions()) = 0; + + /// + /// @brief Resume previously suspended operation. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#resume_op + virtual void ResumeOperation( + const TResumeOperationOptions& options = TResumeOperationOptions()) = 0; + + /// + /// @brief Get operation attributes. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#get_operation + virtual TOperationAttributes GetAttributes( + const TGetOperationOptions& options = TGetOperationOptions()) = 0; + + /// + /// @brief Update operation runtime parameters. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#update_op_parameters + virtual void UpdateParameters( + const TUpdateOperationParametersOptions& options = TUpdateOperationParametersOptions()) = 0; + + /// + /// @brief Get job attributes. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#get_job + virtual TJobAttributes GetJob( + const TJobId& jobId, + const TGetJobOptions& options = TGetJobOptions()) = 0; + + /// + /// List jobs satisfying given filters (see @ref NYT::TListJobsOptions). 
+ /// + /// @see https://yt.yandex-team.ru/docs/api/commands#list_jobs + virtual TListJobsResult ListJobs( + const TListJobsOptions& options = TListJobsOptions()) = 0; +}; + +/// +/// @brief Interface of client capable of managing operations. +struct IOperationClient +{ + /// + /// @brief Run Map operation. + /// + /// @param spec Operation spec. + /// @param mapper Instance of a job to run. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/map + IOperationPtr Map( + const TMapOperationSpec& spec, + ::TIntrusivePtr<IMapperBase> mapper, + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run Map operation. + /// + /// @param mapper Instance of a job to run. + /// @param input Input table(s) + /// @param output Output table(s) + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/map + IOperationPtr Map( + ::TIntrusivePtr<IMapperBase> mapper, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TMapOperationSpec& spec = TMapOperationSpec(), + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run raw Map operation. + /// + /// @param spec Operation spec. + /// @param rawJob Instance of a raw mapper to run. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/map + virtual IOperationPtr RawMap( + const TRawMapOperationSpec& spec, + ::TIntrusivePtr<IRawJob> rawJob, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run Reduce operation. + /// + /// @param spec Operation spec. + /// @param reducer Instance of a job to run. + /// @param options Optional parameters. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/mr/reduce + IOperationPtr Reduce( + const TReduceOperationSpec& spec, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run Reduce operation. + /// + /// @param reducer Instance of a job to run. + /// @param input Input table(s) + /// @param output Output table(s) + /// @param reduceBy Columns to group rows by. + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/reduce + IOperationPtr Reduce( + ::TIntrusivePtr<IReducerBase> reducer, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TSortColumns& reduceBy, + const TReduceOperationSpec& spec = TReduceOperationSpec(), + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run raw Reduce operation. + /// + /// @param spec Operation spec. + /// @param rawJob Instance of a raw reducer to run. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/reduce + virtual IOperationPtr RawReduce( + const TRawReduceOperationSpec& spec, + ::TIntrusivePtr<IRawJob> rawJob, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run JoinReduce operation. + /// + /// @param spec Operation spec. + /// @param reducer Instance of a job to run. + /// @param options Optional parameters. + /// + /// @deprecated Use @ref NYT::IOperationClient::Reduce with @ref NYT::TReduceOperationSpec::EnableKeyGuarantee set to `false`. + IOperationPtr JoinReduce( + const TJoinReduceOperationSpec& spec, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run raw JoinReduce operation. + /// + /// @param spec Operation spec. + /// @param rawJob Instance of a raw reducer to run. + /// @param options Optional parameters.
+ /// + /// @deprecated Use @ref NYT::IOperationClient::RawReduce with @ref NYT::TReduceOperationSpec::EnableKeyGuarantee set to `false`. + virtual IOperationPtr RawJoinReduce( + const TRawJoinReduceOperationSpec& spec, + ::TIntrusivePtr<IRawJob> rawJob, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run MapReduce operation. + /// + /// @param spec Operation spec. + /// @param mapper Instance of a map job to run (identity mapper if `nullptr`). + /// @param reducer Instance of a reduce job to run. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce + IOperationPtr MapReduce( + const TMapReduceOperationSpec& spec, + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run MapReduce operation. + /// + /// @param spec Operation spec. + /// @param mapper Instance of a map job to run (identity mapper if `nullptr`). + /// @param reduceCombiner Instance of a reduce combiner to run (identity reduce combiner if `nullptr`). + /// @param reducer Instance of a reduce job to run. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce + IOperationPtr MapReduce( + const TMapReduceOperationSpec& spec, + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reduceCombiner, + ::TIntrusivePtr<IReducerBase> reducer, + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run MapReduce operation. + /// + /// @param mapper Instance of mapper to run (identity mapper if `nullptr`). + /// @param reducer Instance of reducer to run. + /// @param input Input table(s) + /// @param output Output table(s) + /// @param reduceBy Columns to group rows by. + /// @param spec Operation spec. + /// @param options Optional parameters.
+ /// + /// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce + IOperationPtr MapReduce( + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reducer, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TSortColumns& reduceBy, + TMapReduceOperationSpec spec = TMapReduceOperationSpec(), + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run MapReduce operation. + /// + /// @param mapper Instance of mapper to run (identity mapper if `nullptr`). + /// @param reduceCombiner Instance of reduceCombiner to run (identity reduce combiner if `nullptr`). + /// @param reducer Instance of reducer to run. + /// @param input Input table(s) + /// @param output Output table(s) + /// @param reduceBy Columns to group rows by. + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce + IOperationPtr MapReduce( + ::TIntrusivePtr<IMapperBase> mapper, + ::TIntrusivePtr<IReducerBase> reduceCombiner, + ::TIntrusivePtr<IReducerBase> reducer, + const TOneOrMany<TStructuredTablePath>& input, + const TOneOrMany<TStructuredTablePath>& output, + const TSortColumns& reduceBy, + TMapReduceOperationSpec spec = TMapReduceOperationSpec(), + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run raw MapReduce operation. + /// + /// @param spec Operation spec. + /// @param mapper Instance of a raw mapper to run (identity mapper if `nullptr`). + /// @param reduceCombiner Instance of a raw reduce combiner to run (identity reduce combiner if `nullptr`). + /// @param reducer Instance of a raw reducer to run. + /// @param options Optional parameters.
+ /// + /// @see https://yt.yandex-team.ru/docs/description/mr/mapreduce + virtual IOperationPtr RawMapReduce( + const TRawMapReduceOperationSpec& spec, + ::TIntrusivePtr<IRawJob> mapper, + ::TIntrusivePtr<IRawJob> reduceCombiner, + ::TIntrusivePtr<IRawJob> reducer, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run Sort operation. + /// + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/sort + virtual IOperationPtr Sort( + const TSortOperationSpec& spec, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run Sort operation. + /// + /// @param input Input table(s). + /// @param output Output table. + /// @param sortBy Columns to sort input rows by. + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/sort + IOperationPtr Sort( + const TOneOrMany<TRichYPath>& input, + const TRichYPath& output, + const TSortColumns& sortBy, + const TSortOperationSpec& spec = TSortOperationSpec(), + const TOperationOptions& options = TOperationOptions()); + + /// + /// @brief Run Merge operation. + /// + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/merge + virtual IOperationPtr Merge( + const TMergeOperationSpec& spec, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run Erase operation. + /// + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/erase + virtual IOperationPtr Erase( + const TEraseOperationSpec& spec, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run RemoteCopy operation. + /// + /// @param spec Operation spec. + /// @param options Optional parameters. 
+ /// + /// @see https://yt.yandex-team.ru/docs/description/mr/remote_copy + virtual IOperationPtr RemoteCopy( + const TRemoteCopyOperationSpec& spec, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Run Vanilla operation. + /// + /// @param spec Operation spec. + /// @param options Optional parameters. + /// + /// @see https://yt.yandex-team.ru/docs/description/mr/vanilla + virtual IOperationPtr RunVanilla( + const TVanillaOperationSpec& spec, + const TOperationOptions& options = TOperationOptions()) = 0; + + /// + /// @brief Abort operation. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#abort_op + virtual void AbortOperation( + const TOperationId& operationId) = 0; + + /// + /// @brief Complete operation. + /// + /// @see https://yt.yandex-team.ru/docs/api/commands#complete_op + virtual void CompleteOperation( + const TOperationId& operationId) = 0; + + /// + /// @brief Wait for operation to finish. + virtual void WaitForOperation( + const TOperationId& operationId) = 0; + + /// + /// @brief Check and return operation status. + /// + /// @note this function will never return @ref NYT::EOperationBriefState::Failed or @ref NYT::EOperationBriefState::Aborted status, + /// it will throw @ref NYT::TOperationFailedError instead. + virtual EOperationBriefState CheckOperation( + const TOperationId& operationId) = 0; + + /// + /// @brief Create an operation object given operation id. + /// + /// @throw @ref NYT::TErrorResponse if the operation doesn't exist. 
+ virtual IOperationPtr AttachOperation(const TOperationId& operationId) = 0; + +private: + virtual IOperationPtr DoMap( + const TMapOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> mapper, + const TOperationOptions& options) = 0; + + virtual IOperationPtr DoReduce( + const TReduceOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> reducer, + const TOperationOptions& options) = 0; + + virtual IOperationPtr DoJoinReduce( + const TJoinReduceOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> reducer, + const TOperationOptions& options) = 0; + + virtual IOperationPtr DoMapReduce( + const TMapReduceOperationSpec& spec, + ::TIntrusivePtr<IStructuredJob> mapper, + ::TIntrusivePtr<IStructuredJob> reduceCombiner, + ::TIntrusivePtr<IStructuredJob> reducer, + const TOperationOptions& options) = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT + +#define OPERATION_INL_H_ +#include "operation-inl.h" +#undef OPERATION_INL_H_ diff --git a/yt/cpp/mapreduce/interface/operation_ut.cpp b/yt/cpp/mapreduce/interface/operation_ut.cpp new file mode 100644 index 0000000000..0fa62e1568 --- /dev/null +++ b/yt/cpp/mapreduce/interface/operation_ut.cpp @@ -0,0 +1,269 @@ +#include <yt/cpp/mapreduce/interface/common_ut.h> +#include <yt/cpp/mapreduce/interface/job_statistics.h> +#include <yt/cpp/mapreduce/interface/operation.h> +#include <yt/cpp/mapreduce/interface/protobuf_table_schema_ut.pb.h> + +#include <yt/cpp/mapreduce/tests/yt_unittest_lib/yt_unittest_lib.h> + +#include <library/cpp/yson/node/node_io.h> + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NYT; +using namespace NYT::NUnitTesting; + +class TDummyInferenceContext + : public IOperationPreparationContext +{ +public: + TDummyInferenceContext(int inputCount, int outputCount) + : InputCount_(inputCount) + , OutputCount_(outputCount) + , InputSchemas_(inputCount) + { } + + int GetInputCount() const override + { + return InputCount_; + } 
+ + int GetOutputCount() const override + { + return OutputCount_; + } + + const TVector<TTableSchema>& GetInputSchemas() const override + { + return InputSchemas_; + } + + const TTableSchema& GetInputSchema(int index) const override + { + return InputSchemas_[index]; + } + + TMaybe<TYPath> GetInputPath(int) const override + { + return Nothing(); + } + + TMaybe<TYPath> GetOutputPath(int) const override + { + return Nothing(); + } + +private: + int InputCount_; + int OutputCount_; + TVector<TTableSchema> InputSchemas_; +}; + +Y_UNIT_TEST_SUITE(PrepareOperation) +{ + + Y_UNIT_TEST(BasicSchemas) + { + auto firstSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("some_column").Type(EValueType::VT_UINT64)); + auto otherSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("other_column").Type(EValueType::VT_BOOLEAN)); + auto thirdSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("third_column").Type(EValueType::VT_STRING)); + + TDummyInferenceContext context(3,7); + TJobOperationPreparer builder(context); + + builder + .OutputSchema(1, firstSchema) + .BeginOutputGroup(TVector<int>{2, 5}) + .Schema(otherSchema) + .EndOutputGroup() + .BeginOutputGroup(3, 5) + .Schema(thirdSchema) + .EndOutputGroup() + .BeginOutputGroup(TVector<int>{0, 6}) + .Schema(thirdSchema) + .EndOutputGroup(); + + UNIT_ASSERT_EXCEPTION(builder.OutputSchema(1, otherSchema), TApiUsageError); + UNIT_ASSERT_EXCEPTION(builder.BeginOutputGroup(3, 5).Schema(otherSchema), TApiUsageError); + UNIT_ASSERT_EXCEPTION(builder.BeginOutputGroup(TVector<int>{3,6,7}).Schema(otherSchema), TApiUsageError); + + builder.Finish(); + auto result = builder.GetOutputSchemas(); + + ASSERT_SERIALIZABLES_EQUAL(result[0], thirdSchema); + ASSERT_SERIALIZABLES_EQUAL(result[1], firstSchema); + ASSERT_SERIALIZABLES_EQUAL(result[2], otherSchema); + ASSERT_SERIALIZABLES_EQUAL(result[3], thirdSchema); + ASSERT_SERIALIZABLES_EQUAL(result[4], thirdSchema); + ASSERT_SERIALIZABLES_EQUAL(result[5], otherSchema); + 
ASSERT_SERIALIZABLES_EQUAL(result[6], thirdSchema); + } + + Y_UNIT_TEST(NoSchema) + { + auto schema = TTableSchema() + .AddColumn(TColumnSchema().Name("some_column").Type(EValueType::VT_UINT64)); + + TDummyInferenceContext context(3,4); + TJobOperationPreparer builder(context); + + builder + .OutputSchema(1, schema) + .NoOutputSchema(0) + .BeginOutputGroup(2, 4) + .Schema(schema) + .EndOutputGroup(); + + UNIT_ASSERT_EXCEPTION(builder.OutputSchema(0, schema), TApiUsageError); + + builder.Finish(); + auto result = builder.GetOutputSchemas(); + + UNIT_ASSERT(result[0].Empty()); + + ASSERT_SERIALIZABLES_EQUAL(result[1], schema); + ASSERT_SERIALIZABLES_EQUAL(result[2], schema); + ASSERT_SERIALIZABLES_EQUAL(result[3], schema); + } + + Y_UNIT_TEST(Descriptions) + { + auto urlRowSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("Host").Type(NTi::Optional(NTi::String()))) + .AddColumn(TColumnSchema().Name("Path").Type(NTi::Optional(NTi::String()))) + .AddColumn(TColumnSchema().Name("HttpCode").Type(NTi::Optional(NTi::Int32()))); + + auto urlRowStruct = NTi::Struct({ + {"Host", NTi::Optional(NTi::String())}, + {"Path", NTi::Optional(NTi::String())}, + {"HttpCode", NTi::Optional(NTi::Int32())}, + }); + + auto rowFieldSerializationOptionSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("UrlRow_1").Type(NTi::Optional(urlRowStruct))) + .AddColumn(TColumnSchema().Name("UrlRow_2").Type(NTi::Optional(NTi::String()))); + + auto rowSerializedRepeatedFieldsSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("Ints").Type(NTi::List(NTi::Int64()))) + .AddColumn(TColumnSchema().Name("UrlRows").Type(NTi::List(urlRowStruct))); + + TDummyInferenceContext context(5,7); + TJobOperationPreparer builder(context); + + builder + .InputDescription<TUrlRow>(0) + .BeginInputGroup(2, 3) + .Description<TUrlRow>() + .EndInputGroup() + .BeginInputGroup(TVector<int>{1, 4}) + .Description<TRowSerializedRepeatedFields>() + .EndInputGroup() + .InputDescription<TUrlRow>(3); + + 
UNIT_ASSERT_EXCEPTION(builder.InputDescription<TUrlRow>(0), TApiUsageError); + + builder + .OutputDescription<TUrlRow>(0, false) + .OutputDescription<TRowFieldSerializationOption>(1) + .BeginOutputGroup(2, 4) + .Description<TUrlRow>() + .EndOutputGroup() + .BeginOutputGroup(TVector<int>{4,6}) + .Description<TRowSerializedRepeatedFields>() + .EndOutputGroup() + .OutputDescription<TUrlRow>(5, false); + + UNIT_ASSERT_EXCEPTION(builder.OutputDescription<TUrlRow>(0), TApiUsageError); + UNIT_ASSERT_NO_EXCEPTION(builder.OutputSchema(0, urlRowSchema)); + UNIT_ASSERT_NO_EXCEPTION(builder.OutputSchema(5, urlRowSchema)); + UNIT_ASSERT_EXCEPTION(builder.OutputSchema(1, urlRowSchema), TApiUsageError); + + builder.Finish(); + auto result = builder.GetOutputSchemas(); + + ASSERT_SERIALIZABLES_EQUAL(result[0], urlRowSchema); + ASSERT_SERIALIZABLES_EQUAL(result[1], rowFieldSerializationOptionSchema); + ASSERT_SERIALIZABLES_EQUAL(result[2], urlRowSchema); + ASSERT_SERIALIZABLES_EQUAL(result[3], urlRowSchema); + ASSERT_SERIALIZABLES_EQUAL(result[4], rowSerializedRepeatedFieldsSchema); + ASSERT_SERIALIZABLES_EQUAL(result[5], urlRowSchema); + ASSERT_SERIALIZABLES_EQUAL(result[6], rowSerializedRepeatedFieldsSchema); + + auto expectedInputDescriptions = TVector<TMaybe<TTableStructure>>{ + {TProtobufTableStructure{TUrlRow::descriptor()}}, + {TProtobufTableStructure{TRowSerializedRepeatedFields::descriptor()}}, + {TProtobufTableStructure{TUrlRow::descriptor()}}, + {TProtobufTableStructure{TUrlRow::descriptor()}}, + {TProtobufTableStructure{TRowSerializedRepeatedFields::descriptor()}}, + }; + UNIT_ASSERT_EQUAL(expectedInputDescriptions, builder.GetInputDescriptions()); + + auto expectedOutputDescriptions = TVector<TMaybe<TTableStructure>>{ + {TProtobufTableStructure{TUrlRow::descriptor()}}, + {TProtobufTableStructure{TRowFieldSerializationOption::descriptor()}}, + {TProtobufTableStructure{TUrlRow::descriptor()}}, + {TProtobufTableStructure{TUrlRow::descriptor()}}, + 
{TProtobufTableStructure{TRowSerializedRepeatedFields::descriptor()}}, + {TProtobufTableStructure{TUrlRow::descriptor()}}, + {TProtobufTableStructure{TRowSerializedRepeatedFields::descriptor()}}, + }; + UNIT_ASSERT_EQUAL(expectedOutputDescriptions, builder.GetOutputDescriptions()); + } + + Y_UNIT_TEST(InputColumns) + { + TDummyInferenceContext context(5, 1); + TJobOperationPreparer builder(context); + builder + .InputColumnFilter(2, {"a", "b"}) + .BeginInputGroup(0, 2) + .ColumnFilter({"b", "c"}) + .ColumnRenaming({{"b", "B"}, {"c", "C"}}) + .EndInputGroup() + .InputColumnRenaming(3, {{"a", "AAA"}}) + .NoOutputSchema(0); + builder.Finish(); + + auto expectedRenamings = TVector<THashMap<TString, TString>>{ + {{"b", "B"}, {"c", "C"}}, + {{"b", "B"}, {"c", "C"}}, + {}, + {{"a", "AAA"}}, + {}, + }; + UNIT_ASSERT_EQUAL(builder.GetInputColumnRenamings(), expectedRenamings); + + auto expectedFilters = TVector<TMaybe<TVector<TString>>>{ + {{"b", "c"}}, + {{"b", "c"}}, + {{"a", "b"}}, + {}, + {}, + }; + UNIT_ASSERT_EQUAL(builder.GetInputColumnFilters(), expectedFilters); + } + + Y_UNIT_TEST(Bug_r7349102) + { + auto firstSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("some_column").Type(EValueType::VT_UINT64)); + auto otherSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("other_column").Type(EValueType::VT_BOOLEAN)); + auto thirdSchema = TTableSchema() + .AddColumn(TColumnSchema().Name("third_column").Type(EValueType::VT_STRING)); + + TDummyInferenceContext context(3,1); + TJobOperationPreparer builder(context); + + builder + .InputDescription<TUrlRow>(0) + .InputDescription<TUrlRow>(1) + .InputDescription<TUrlRow>(2) + .OutputDescription<TUrlRow>(0); + + builder.Finish(); + } + +} // Y_UNIT_TEST_SUITE(PrepareOperation) diff --git a/yt/cpp/mapreduce/interface/proto3_ut.proto b/yt/cpp/mapreduce/interface/proto3_ut.proto new file mode 100644 index 0000000000..b24c13085b --- /dev/null +++ b/yt/cpp/mapreduce/interface/proto3_ut.proto @@ -0,0 +1,17 @@ +syntax = 
"proto3"; + +import "yt/yt_proto/yt/formats/extension.proto"; + +package NYT.NTestingProto3; + +option (NYT.file_default_field_flags) = SERIALIZATION_YT; + +message TWithOptional +{ + optional int64 x = 1; +} + +message TWithOptionalMessage +{ + optional TWithOptional x = 1; +} diff --git a/yt/cpp/mapreduce/interface/protobuf_file_options_ut.cpp b/yt/cpp/mapreduce/interface/protobuf_file_options_ut.cpp new file mode 100644 index 0000000000..5ffa9564d7 --- /dev/null +++ b/yt/cpp/mapreduce/interface/protobuf_file_options_ut.cpp @@ -0,0 +1,271 @@ +#include "errors.h" +#include "format.h" +#include "common_ut.h" + +#include <yt/cpp/mapreduce/interface/protobuf_file_options_ut.pb.h> + +#include <yt/cpp/mapreduce/tests/yt_unittest_lib/yt_unittest_lib.h> + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NYT; + +Y_UNIT_TEST_SUITE(ProtobufFileOptions) +{ + NTi::TTypePtr GetUrlRowType(bool required) + { + static const NTi::TTypePtr structType = NTi::Struct({ + {"Host", ToTypeV3(EValueType::VT_STRING, false)}, + {"Path", ToTypeV3(EValueType::VT_STRING, false)}, + {"HttpCode", ToTypeV3(EValueType::VT_INT32, false)}}); + return required ? 
structType : NTi::TTypePtr(NTi::Optional(structType)); + } + + Y_UNIT_TEST(TRowFieldSerializationOption) + { + const auto schema = CreateTableSchema<NTestingFileOptions::TRowFieldSerializationOption>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("UrlRow_1").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("UrlRow_2").Type(GetUrlRowType(false)))); + } + + Y_UNIT_TEST(TRowMixedSerializationOptions) + { + const auto schema = CreateTableSchema<NTestingFileOptions::TRowMixedSerializationOptions>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("UrlRow_1").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("UrlRow_2").Type(GetUrlRowType(false)))); + } + + Y_UNIT_TEST(FieldSortOrder) + { + const auto schema = CreateTableSchema<NTestingFileOptions::TFieldSortOrder>(); + + auto asInProtoFile = NTi::Optional(NTi::Struct({ + {"x", NTi::Optional(NTi::Int64())}, + {"y", NTi::Optional(NTi::String())}, + {"z", NTi::Optional(NTi::Bool())}, + })); + auto byFieldNumber = NTi::Optional(NTi::Struct({ + {"z", NTi::Optional(NTi::Bool())}, + {"x", NTi::Optional(NTi::Int64())}, + {"y", NTi::Optional(NTi::String())}, + })); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("EmbeddedDefault").Type(asInProtoFile)) + .AddColumn(TColumnSchema().Name("EmbeddedAsInProtoFile").Type(asInProtoFile)) + .AddColumn(TColumnSchema().Name("EmbeddedByFieldNumber").Type(byFieldNumber))); + } + + Y_UNIT_TEST(Map) + { + const auto schema = CreateTableSchema<NTestingFileOptions::TWithMap>(); + + auto createKeyValueStruct = [] (NTi::TTypePtr key, NTi::TTypePtr value) { + return NTi::List(NTi::Struct({ + {"key", NTi::Optional(key)}, + {"value", NTi::Optional(value)}, + })); + }; + + auto embedded = NTi::Struct({ + {"x", NTi::Optional(NTi::Int64())}, + {"y", NTi::Optional(NTi::String())}, + }); + + 
ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("MapDefault") + .Type(createKeyValueStruct(NTi::Int64(), embedded))) + .AddColumn(TColumnSchema() + .Name("MapDict") + .Type(NTi::Dict(NTi::Int64(), embedded)))); + } + + Y_UNIT_TEST(Oneof) + { + const auto schema = CreateTableSchema<NTestingFileOptions::TWithOneof>(); + + auto embedded = NTi::Struct({ + {"x", NTi::Optional(NTi::Int64())}, + {"y", NTi::Optional(NTi::String())}, + }); + + auto defaultVariantType = NTi::Optional(NTi::Struct({ + {"field", NTi::Optional(NTi::String())}, + {"Oneof2", NTi::Optional(NTi::Variant(NTi::Struct({ + {"y2", NTi::String()}, + {"z2", embedded}, + {"x2", NTi::Int64()}, + })))}, + {"x1", NTi::Optional(NTi::Int64())}, + {"y1", NTi::Optional(NTi::String())}, + {"z1", NTi::Optional(embedded)}, + })); + + auto noDefaultType = NTi::Optional(NTi::Struct({ + {"field", NTi::Optional(NTi::String())}, + {"y2", NTi::Optional(NTi::String())}, + {"z2", NTi::Optional(embedded)}, + {"x2", NTi::Optional(NTi::Int64())}, + {"x1", NTi::Optional(NTi::Int64())}, + {"y1", NTi::Optional(NTi::String())}, + {"z1", NTi::Optional(embedded)}, + })); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("DefaultVariant") + .Type(defaultVariantType) + ) + .AddColumn(TColumnSchema() + .Name("NoDefault") + .Type(noDefaultType) + ) + .AddColumn(TColumnSchema() + .Name("SerializationProtobuf") + .Type(NTi::Optional(NTi::Struct({ + {"x1", NTi::Optional(NTi::Int64())}, + {"y1", NTi::Optional(NTi::String())}, + {"z1", NTi::Optional(NTi::String())}, + }))) + ) + .AddColumn(TColumnSchema() + .Name("MemberOfTopLevelOneof") + .Type(NTi::Optional(NTi::Int64())) + ) + ); + } +} + +static TNode GetColumns(const TFormat& format, int tableIndex = 0) +{ + return format.Config.GetAttributes()["tables"][tableIndex]["columns"]; +} + +Y_UNIT_TEST_SUITE(ProtobufFormatFileOptions) +{ + Y_UNIT_TEST(TRowFieldSerializationOption) + { + const auto format = 
TFormat::Protobuf<NTestingFileOptions::TRowFieldSerializationOption>(); + auto columns = GetColumns(format); + + UNIT_ASSERT_VALUES_EQUAL(columns[0]["name"], "UrlRow_1"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["proto_type"], "message"); + UNIT_ASSERT_VALUES_EQUAL(columns[0]["field_number"], 1); + + UNIT_ASSERT_VALUES_EQUAL(columns[1]["name"], "UrlRow_2"); + UNIT_ASSERT_VALUES_EQUAL(columns[1]["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(columns[1]["field_number"], 2); + const auto& fields = columns[1]["fields"]; + UNIT_ASSERT_VALUES_EQUAL(fields[0]["name"], "Host"); + UNIT_ASSERT_VALUES_EQUAL(fields[0]["proto_type"], "string"); + UNIT_ASSERT_VALUES_EQUAL(fields[0]["field_number"], 1); + + UNIT_ASSERT_VALUES_EQUAL(fields[1]["name"], "Path"); + UNIT_ASSERT_VALUES_EQUAL(fields[1]["proto_type"], "string"); + UNIT_ASSERT_VALUES_EQUAL(fields[1]["field_number"], 2); + + UNIT_ASSERT_VALUES_EQUAL(fields[2]["name"], "HttpCode"); + UNIT_ASSERT_VALUES_EQUAL(fields[2]["proto_type"], "sint32"); + UNIT_ASSERT_VALUES_EQUAL(fields[2]["field_number"], 3); + } + + Y_UNIT_TEST(Map) + { + const auto format = TFormat::Protobuf<NTestingFileOptions::TWithMap>(); + auto columns = GetColumns(format); + + UNIT_ASSERT_VALUES_EQUAL(columns.Size(), 2); + { + const auto& column = columns[0]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "MapDefault"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 2); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["proto_type"], "int64"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][1]["proto_type"], "structured_message"); + } + { + const auto& column = columns[1]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "MapDict"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 2); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["proto_type"], "int64"); + 
UNIT_ASSERT_VALUES_EQUAL(column["fields"][1]["proto_type"], "structured_message"); + } + } + + Y_UNIT_TEST(Oneof) + { + const auto format = TFormat::Protobuf<NTestingFileOptions::TWithOneof>(); + auto columns = GetColumns(format); + + UNIT_ASSERT_VALUES_EQUAL(columns.Size(), 4); + + { + const auto& column = columns[0]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "DefaultVariant"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 5); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["name"], "field"); + + const auto& oneof2 = column["fields"][1]; + UNIT_ASSERT_VALUES_EQUAL(oneof2["name"], "Oneof2"); + UNIT_ASSERT_VALUES_EQUAL(oneof2["proto_type"], "oneof"); + UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][0]["name"], "y2"); + UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][1]["name"], "z2"); + UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][1]["proto_type"], "structured_message"); + const auto& embeddedFields = oneof2["fields"][1]["fields"]; + UNIT_ASSERT_VALUES_EQUAL(embeddedFields[0]["name"], "x"); + UNIT_ASSERT_VALUES_EQUAL(embeddedFields[1]["name"], "y"); + + UNIT_ASSERT_VALUES_EQUAL(oneof2["fields"][2]["name"], "x2"); + + UNIT_ASSERT_VALUES_EQUAL(column["fields"][2]["name"], "x1"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][3]["name"], "y1"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][4]["name"], "z1"); + }; + + { + const auto& column = columns[1]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "NoDefault"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message"); + const auto& fields = column["fields"]; + UNIT_ASSERT_VALUES_EQUAL(fields.Size(), 7); + + UNIT_ASSERT_VALUES_EQUAL(fields[0]["name"], "field"); + + UNIT_ASSERT_VALUES_EQUAL(fields[1]["name"], "y2"); + + UNIT_ASSERT_VALUES_EQUAL(fields[2]["name"], "z2"); + UNIT_ASSERT_VALUES_EQUAL(fields[2]["proto_type"], "structured_message"); + const auto& embeddedFields = fields[2]["fields"]; + UNIT_ASSERT_VALUES_EQUAL(embeddedFields[0]["name"], 
"x"); + UNIT_ASSERT_VALUES_EQUAL(embeddedFields[1]["name"], "y"); + + UNIT_ASSERT_VALUES_EQUAL(fields[3]["name"], "x2"); + + UNIT_ASSERT_VALUES_EQUAL(fields[4]["name"], "x1"); + UNIT_ASSERT_VALUES_EQUAL(fields[5]["name"], "y1"); + UNIT_ASSERT_VALUES_EQUAL(fields[6]["name"], "z1"); + }; + + { + const auto& column = columns[2]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "SerializationProtobuf"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "structured_message"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"].Size(), 3); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][0]["name"], "x1"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][1]["name"], "y1"); + UNIT_ASSERT_VALUES_EQUAL(column["fields"][2]["name"], "z1"); + } + { + const auto& column = columns[3]; + UNIT_ASSERT_VALUES_EQUAL(column["name"], "MemberOfTopLevelOneof"); + UNIT_ASSERT_VALUES_EQUAL(column["proto_type"], "int64"); + } + } +} diff --git a/yt/cpp/mapreduce/interface/protobuf_file_options_ut.proto b/yt/cpp/mapreduce/interface/protobuf_file_options_ut.proto new file mode 100644 index 0000000000..4804b2f60c --- /dev/null +++ b/yt/cpp/mapreduce/interface/protobuf_file_options_ut.proto @@ -0,0 +1,142 @@ +import "yt/yt_proto/yt/formats/extension.proto"; + +package NYT.NTestingFileOptions; + +option (NYT.file_default_field_flags) = SERIALIZATION_YT; +option (NYT.file_default_field_flags) = MAP_AS_LIST_OF_STRUCTS; +option (NYT.file_default_message_flags) = DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE; +option (NYT.file_default_oneof_flags) = SEPARATE_FIELDS; + +message TUrlRow +{ + optional string Host = 1 [(NYT.column_name) = "Host"]; + optional string Path = 2 [(NYT.column_name) = "Path"]; + optional sint32 HttpCode = 3 [(NYT.column_name) = "HttpCode"]; +} + +message TRowFieldSerializationOption +{ + optional TUrlRow UrlRow_1 = 1 [(NYT.flags) = SERIALIZATION_PROTOBUF]; + optional TUrlRow UrlRow_2 = 2; +} + +message TRowMixedSerializationOptions +{ + option (NYT.default_field_flags) = SERIALIZATION_PROTOBUF; + optional 
TUrlRow UrlRow_1 = 1; + optional TUrlRow UrlRow_2 = 2 [(NYT.flags) = SERIALIZATION_YT]; +} + +message TRowSerializedRepeatedFields +{ + repeated int64 Ints = 1; + repeated TUrlRow UrlRows = 2; +} + +message TFieldSortOrder +{ + message TEmbeddedDefault { + optional int64 x = 2; + optional string y = 12; + optional bool z = 1; + } + message TEmbeddedAsInProtoFile { + option (NYT.message_flags) = DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE; + optional int64 x = 2; + optional string y = 12; + optional bool z = 1; + } + message TEmbeddedByFieldNumber { + option (NYT.message_flags) = SORT_FIELDS_BY_FIELD_NUMBER; + optional int64 x = 2; + optional string y = 12; + optional bool z = 1; + } + option (NYT.default_field_flags) = SERIALIZATION_YT; + + optional TEmbeddedDefault EmbeddedDefault = 1; + optional TEmbeddedAsInProtoFile EmbeddedAsInProtoFile = 2; + optional TEmbeddedByFieldNumber EmbeddedByFieldNumber = 3; +} + +message TWithMap +{ + message TEmbedded { + optional int64 x = 1; + optional string y = 2; + } + + map<int64, TEmbedded> MapDefault = 1; + map<int64, TEmbedded> MapDict = 5 [(NYT.flags) = MAP_AS_DICT]; +} + +message TWithOneof +{ + message TEmbedded + { + oneof Oneof { + int64 x = 1; + string y = 2; + } + } + + message TDefaultVariant + { + option (NYT.default_oneof_flags) = VARIANT; + optional string field = 1; + + oneof Oneof2 + { + string y2 = 4; + TEmbedded z2 = 6; + int64 x2 = 2; + } + + oneof Oneof1 + { + option (NYT.oneof_flags) = SEPARATE_FIELDS; + int64 x1 = 10; + string y1 = 3; + TEmbedded z1 = 5; + } + } + + message TNoDefault + { + optional string field = 1; + + oneof Oneof2 + { + string y2 = 4; + TEmbedded z2 = 6; + int64 x2 = 2; + } + + oneof Oneof1 + { + int64 x1 = 10; + string y1 = 3; + TEmbedded z1 = 5; + } + } + + message TSerializationProtobuf + { + option (NYT.default_field_flags) = SERIALIZATION_PROTOBUF; + oneof Oneof + { + int64 x1 = 2; + string y1 = 1; + TEmbedded z1 = 3; + } + } + + optional TDefaultVariant DefaultVariant = 1; + 
optional TNoDefault NoDefault = 2; + optional TSerializationProtobuf SerializationProtobuf = 3; + + oneof TopLevelOneof + { + int64 MemberOfTopLevelOneof = 4; + } +} diff --git a/yt/cpp/mapreduce/interface/protobuf_format.cpp b/yt/cpp/mapreduce/interface/protobuf_format.cpp new file mode 100644 index 0000000000..3d57ed2797 --- /dev/null +++ b/yt/cpp/mapreduce/interface/protobuf_format.cpp @@ -0,0 +1,1498 @@ +#include "protobuf_format.h" + +#include "errors.h" + +#include <yt/yt_proto/yt/formats/extension.pb.h> + +#include <google/protobuf/text_format.h> + +#include <library/cpp/yson/node/node_io.h> + +#include <util/generic/hash_set.h> +#include <util/generic/stack.h> +#include <util/generic/overloaded.h> + +#include <util/stream/output.h> +#include <util/stream/file.h> + +namespace NYT::NDetail { + +using ::google::protobuf::Descriptor; +using ::google::protobuf::DescriptorProto; +using ::google::protobuf::EnumDescriptor; +using ::google::protobuf::EnumDescriptorProto; +using ::google::protobuf::FieldDescriptor; +using ::google::protobuf::FieldDescriptorProto; +using ::google::protobuf::OneofDescriptor; +using ::google::protobuf::Message; +using ::google::protobuf::FileDescriptor; +using ::google::protobuf::FileDescriptorProto; +using ::google::protobuf::FileDescriptorSet; +using ::google::protobuf::FieldOptions; +using ::google::protobuf::FileOptions; +using ::google::protobuf::OneofOptions; +using ::google::protobuf::MessageOptions; + +using ::ToString; + +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +using TOneofOption = std::variant< + EProtobufOneofMode>; + +using TFieldOption = std::variant< + EProtobufType, + EProtobufSerializationMode, + EProtobufListMode, + EProtobufMapMode, + EProtobufEnumWritingMode>; + +using TMessageOption = std::variant< + EProtobufFieldSortOrder>; + +struct TOtherColumns +{ }; + +using TValueTypeOrOtherColumns = std::variant<EValueType, TOtherColumns>; + 
+//////////////////////////////////////////////////////////////////////////////// + +TFieldOption FieldFlagToOption(EWrapperFieldFlag::Enum flag) +{ + using EFlag = EWrapperFieldFlag; + switch (flag) { + case EFlag::SERIALIZATION_PROTOBUF: + return EProtobufSerializationMode::Protobuf; + case EFlag::SERIALIZATION_YT: + return EProtobufSerializationMode::Yt; + + case EFlag::ANY: + return EProtobufType::Any; + case EFlag::OTHER_COLUMNS: + return EProtobufType::OtherColumns; + case EFlag::ENUM_INT: + return EProtobufType::EnumInt; + case EFlag::ENUM_STRING: + return EProtobufType::EnumString; + + case EFlag::OPTIONAL_LIST: + return EProtobufListMode::Optional; + case EFlag::REQUIRED_LIST: + return EProtobufListMode::Required; + + case EFlag::MAP_AS_LIST_OF_STRUCTS_LEGACY: + return EProtobufMapMode::ListOfStructsLegacy; + case EFlag::MAP_AS_LIST_OF_STRUCTS: + return EProtobufMapMode::ListOfStructs; + case EFlag::MAP_AS_DICT: + return EProtobufMapMode::Dict; + case EFlag::MAP_AS_OPTIONAL_DICT: + return EProtobufMapMode::OptionalDict; + case EFlag::EMBEDDED: + return EProtobufSerializationMode::Embedded; + + case EFlag::ENUM_SKIP_UNKNOWN_VALUES: + return EProtobufEnumWritingMode::SkipUnknownValues; + case EFlag::ENUM_CHECK_VALUES: + return EProtobufEnumWritingMode::CheckValues; + } + Y_FAIL(); +} + +TMessageOption MessageFlagToOption(EWrapperMessageFlag::Enum flag) +{ + using EFlag = EWrapperMessageFlag; + switch (flag) { + case EFlag::DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE: + return EProtobufFieldSortOrder::AsInProtoFile; + case EFlag::SORT_FIELDS_BY_FIELD_NUMBER: + return EProtobufFieldSortOrder::ByFieldNumber; + } + Y_FAIL(); +} + +TOneofOption OneofFlagToOption(EWrapperOneofFlag::Enum flag) +{ + using EFlag = EWrapperOneofFlag; + switch (flag) { + case EFlag::SEPARATE_FIELDS: + return EProtobufOneofMode::SeparateFields; + case EFlag::VARIANT: + return EProtobufOneofMode::Variant; + } + Y_FAIL(); +} + +EWrapperFieldFlag::Enum OptionToFieldFlag(TFieldOption option) +{ 
+ using EFlag = EWrapperFieldFlag; + struct TVisitor + { + EFlag::Enum operator() (EProtobufType type) + { + switch (type) { + case EProtobufType::Any: + return EFlag::ANY; + case EProtobufType::OtherColumns: + return EFlag::OTHER_COLUMNS; + case EProtobufType::EnumInt: + return EFlag::ENUM_INT; + case EProtobufType::EnumString: + return EFlag::ENUM_STRING; + } + Y_FAIL(); + } + EFlag::Enum operator() (EProtobufSerializationMode serializationMode) + { + switch (serializationMode) { + case EProtobufSerializationMode::Yt: + return EFlag::SERIALIZATION_YT; + case EProtobufSerializationMode::Protobuf: + return EFlag::SERIALIZATION_PROTOBUF; + case EProtobufSerializationMode::Embedded: + return EFlag::EMBEDDED; + } + Y_FAIL(); + } + EFlag::Enum operator() (EProtobufListMode listMode) + { + switch (listMode) { + case EProtobufListMode::Optional: + return EFlag::OPTIONAL_LIST; + case EProtobufListMode::Required: + return EFlag::REQUIRED_LIST; + } + Y_FAIL(); + } + EFlag::Enum operator() (EProtobufMapMode mapMode) + { + switch (mapMode) { + case EProtobufMapMode::ListOfStructsLegacy: + return EFlag::MAP_AS_LIST_OF_STRUCTS_LEGACY; + case EProtobufMapMode::ListOfStructs: + return EFlag::MAP_AS_LIST_OF_STRUCTS; + case EProtobufMapMode::Dict: + return EFlag::MAP_AS_DICT; + case EProtobufMapMode::OptionalDict: + return EFlag::MAP_AS_OPTIONAL_DICT; + } + Y_FAIL(); + } + EFlag::Enum operator() (EProtobufEnumWritingMode enumWritingMode) + { + switch (enumWritingMode) { + case EProtobufEnumWritingMode::SkipUnknownValues: + return EFlag::ENUM_SKIP_UNKNOWN_VALUES; + case EProtobufEnumWritingMode::CheckValues: + return EFlag::ENUM_CHECK_VALUES; + } + Y_FAIL(); + } + }; + + return std::visit(TVisitor(), option); +} + +EWrapperMessageFlag::Enum OptionToMessageFlag(TMessageOption option) +{ + using EFlag = EWrapperMessageFlag; + struct TVisitor + { + EFlag::Enum operator() (EProtobufFieldSortOrder sortOrder) + { + switch (sortOrder) { + case EProtobufFieldSortOrder::AsInProtoFile: + 
return EFlag::DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE; + case EProtobufFieldSortOrder::ByFieldNumber: + return EFlag::SORT_FIELDS_BY_FIELD_NUMBER; + } + Y_FAIL(); + } + }; + + return std::visit(TVisitor(), option); +} + +EWrapperOneofFlag::Enum OptionToOneofFlag(TOneofOption option) +{ + using EFlag = EWrapperOneofFlag; + struct TVisitor + { + EFlag::Enum operator() (EProtobufOneofMode mode) + { + switch (mode) { + case EProtobufOneofMode::SeparateFields: + return EFlag::SEPARATE_FIELDS; + case EProtobufOneofMode::Variant: + return EFlag::VARIANT; + } + Y_FAIL(); + } + }; + + return std::visit(TVisitor(), option); +} + + +template <typename T, typename TOptionToFlag> +void SetOption(TMaybe<T>& option, T newOption, TOptionToFlag optionToFlag) +{ + if (option) { + if (*option == newOption) { + ythrow yexception() << "Duplicate protobuf flag " << optionToFlag(newOption); + } else { + ythrow yexception() << "Incompatible protobuf flags " << + optionToFlag(*option) << " and " << optionToFlag(newOption); + } + } + option = newOption; +} + +class TParseProtobufFieldOptionsVisitor +{ +public: + void operator() (EProtobufType type) + { + SetOption(Type, type); + } + + void operator() (EProtobufSerializationMode serializationMode) + { + SetOption(SerializationMode, serializationMode); + } + + void operator() (EProtobufListMode listMode) + { + SetOption(ListMode, listMode); + } + + void operator() (EProtobufMapMode mapMode) + { + SetOption(MapMode, mapMode); + } + + void operator() (EProtobufEnumWritingMode enumWritingMode) + { + SetOption(EnumWritingMode, enumWritingMode); + } + + template <typename T> + void SetOption(TMaybe<T>& option, T newOption) + { + NYT::NDetail::SetOption(option, newOption, OptionToFieldFlag); + } + +public: + TMaybe<EProtobufType> Type; + TMaybe<EProtobufSerializationMode> SerializationMode; + TMaybe<EProtobufListMode> ListMode; + TMaybe<EProtobufMapMode> MapMode; + TMaybe<EProtobufEnumWritingMode> EnumWritingMode; +}; + +class 
TParseProtobufMessageOptionsVisitor +{ +public: + void operator() (EProtobufFieldSortOrder fieldSortOrder) + { + SetOption(FieldSortOrder, fieldSortOrder); + } + + template <typename T> + void SetOption(TMaybe<T>& option, T newOption) + { + NYT::NDetail::SetOption(option, newOption, OptionToMessageFlag); + } + +public: + TMaybe<EProtobufFieldSortOrder> FieldSortOrder; +}; + +class TParseProtobufOneofOptionsVisitor +{ +public: + void operator() (EProtobufOneofMode mode) + { + SetOption(Mode, mode); + } + + template <typename T> + void SetOption(TMaybe<T>& option, T newOption) + { + NYT::NDetail::SetOption(option, newOption, OptionToOneofFlag); + } + +public: + TMaybe<EProtobufOneofMode> Mode; +}; + +void ParseProtobufFieldOptions( + const ::google::protobuf::RepeatedField<EWrapperFieldFlag::Enum>& flags, + TProtobufFieldOptions* fieldOptions) +{ + TParseProtobufFieldOptionsVisitor visitor; + for (auto flag : flags) { + std::visit(visitor, FieldFlagToOption(flag)); + } + if (visitor.Type) { + fieldOptions->Type = *visitor.Type; + } + if (visitor.SerializationMode) { + fieldOptions->SerializationMode = *visitor.SerializationMode; + } + if (visitor.ListMode) { + fieldOptions->ListMode = *visitor.ListMode; + } + if (visitor.MapMode) { + fieldOptions->MapMode = *visitor.MapMode; + } +} + +void ParseProtobufMessageOptions( + const ::google::protobuf::RepeatedField<EWrapperMessageFlag::Enum>& flags, + TProtobufMessageOptions* messageOptions) +{ + TParseProtobufMessageOptionsVisitor visitor; + for (auto flag : flags) { + std::visit(visitor, MessageFlagToOption(flag)); + } + if (visitor.FieldSortOrder) { + messageOptions->FieldSortOrder = *visitor.FieldSortOrder; + } +} + +void ParseProtobufOneofOptions( + const ::google::protobuf::RepeatedField<EWrapperOneofFlag::Enum>& flags, + TProtobufOneofOptions* messageOptions) +{ + TParseProtobufOneofOptionsVisitor visitor; + for (auto flag : flags) { + std::visit(visitor, OneofFlagToOption(flag)); + } + if (visitor.Mode) { + 
messageOptions->Mode = *visitor.Mode; + } +} + +TProtobufFieldOptions GetDefaultFieldOptions( + const Descriptor* descriptor, + TProtobufFieldOptions defaultFieldOptions = {}) +{ + ParseProtobufFieldOptions( + descriptor->file()->options().GetRepeatedExtension(file_default_field_flags), + &defaultFieldOptions); + ParseProtobufFieldOptions( + descriptor->options().GetRepeatedExtension(default_field_flags), + &defaultFieldOptions); + return defaultFieldOptions; +} + +TProtobufOneofOptions GetDefaultOneofOptions(const Descriptor* descriptor) +{ + TProtobufOneofOptions defaultOneofOptions; + ParseProtobufOneofOptions( + descriptor->file()->options().GetRepeatedExtension(file_default_oneof_flags), + &defaultOneofOptions); + ParseProtobufOneofOptions( + descriptor->options().GetRepeatedExtension(default_oneof_flags), + &defaultOneofOptions); + switch (defaultOneofOptions.Mode) { + case EProtobufOneofMode::Variant: { + auto defaultFieldOptions = GetDefaultFieldOptions(descriptor); + switch (defaultFieldOptions.SerializationMode) { + case EProtobufSerializationMode::Protobuf: + // For Protobuf serialization mode default is SeparateFields. 
+ defaultOneofOptions.Mode = EProtobufOneofMode::SeparateFields; + return defaultOneofOptions; + case EProtobufSerializationMode::Yt: + case EProtobufSerializationMode::Embedded: + return defaultOneofOptions; + } + Y_FAIL(); + } + case EProtobufOneofMode::SeparateFields: + return defaultOneofOptions; + } + Y_FAIL(); +} + +//////////////////////////////////////////////////////////////////////////////// + +void ValidateProtobufType(const FieldDescriptor& fieldDescriptor, EProtobufType protobufType) +{ + const auto fieldType = fieldDescriptor.type(); + auto ensureType = [&] (FieldDescriptor::Type expectedType) { + Y_ENSURE(fieldType == expectedType, + "Type of field " << fieldDescriptor.name() << "does not match specified field flag " << + OptionToFieldFlag(protobufType) << ": " + "expected " << FieldDescriptor::TypeName(expectedType) << ", " << + "got " << FieldDescriptor::TypeName(fieldType)); + }; + switch (protobufType) { + case EProtobufType::Any: + ensureType(FieldDescriptor::TYPE_BYTES); + return; + case EProtobufType::OtherColumns: + ensureType(FieldDescriptor::TYPE_BYTES); + return; + case EProtobufType::EnumInt: + ensureType(FieldDescriptor::TYPE_ENUM); + return; + case EProtobufType::EnumString: + ensureType(FieldDescriptor::TYPE_ENUM); + return; + } + Y_FAIL(); +} + +//////////////////////////////////////////////////////////////////////////////// + +class TCycleChecker +{ +private: + class TGuard + { + public: + TGuard(TCycleChecker* checker, const Descriptor* descriptor) + : Checker_(checker) + , Descriptor_(descriptor) + { + Checker_->ActiveVertices_.insert(Descriptor_); + Checker_->Stack_.push(Descriptor_); + } + + ~TGuard() + { + Checker_->ActiveVertices_.erase(Descriptor_); + Checker_->Stack_.pop(); + } + + private: + TCycleChecker* Checker_; + const Descriptor* Descriptor_; + }; + +public: + [[nodiscard]] TGuard Enter(const Descriptor* descriptor) + { + if (ActiveVertices_.contains(descriptor)) { + Y_VERIFY(!Stack_.empty()); + ythrow TApiUsageError() 
<< "Cyclic reference found for protobuf messages. " << + "Consider removing " << EWrapperFieldFlag::SERIALIZATION_YT << " flag " << + "somewhere on the cycle containing " << + Stack_.top()->full_name() << " and " << descriptor->full_name(); + } + return TGuard(this, descriptor); + } + +private: + THashSet<const Descriptor*> ActiveVertices_; + TStack<const Descriptor*> Stack_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +TProtobufFieldOptions GetFieldOptions( + const FieldDescriptor* fieldDescriptor, + const TMaybe<TProtobufFieldOptions>& defaultFieldOptions) +{ + TProtobufFieldOptions options; + if (defaultFieldOptions) { + options = *defaultFieldOptions; + } else { + options = GetDefaultFieldOptions(fieldDescriptor->containing_type()); + } + ParseProtobufFieldOptions(fieldDescriptor->options().GetRepeatedExtension(flags), &options); + return options; +} + +TProtobufOneofOptions GetOneofOptions( + const OneofDescriptor* oneofDescriptor, + const TMaybe<TProtobufOneofOptions>& defaultOneofOptions) +{ + TProtobufOneofOptions options; + if (defaultOneofOptions) { + options = *defaultOneofOptions; + } else { + options = GetDefaultOneofOptions(oneofDescriptor->containing_type()); + } + ParseProtobufOneofOptions(oneofDescriptor->options().GetRepeatedExtension(oneof_flags), &options); + + if (oneofDescriptor->is_synthetic()) { + options.Mode = EProtobufOneofMode::SeparateFields; + } + + auto variantFieldName = oneofDescriptor->options().GetExtension(variant_field_name); + switch (options.Mode) { + case EProtobufOneofMode::SeparateFields: + if (variantFieldName) { + ythrow TApiUsageError() << "\"variant_field_name\" requires (NYT.oneof_flags) = VARIANT"; + } + break; + case EProtobufOneofMode::Variant: + if (variantFieldName) { + options.VariantFieldName = variantFieldName; + } else { + options.VariantFieldName = 
oneofDescriptor->name(); + } + break; + } + return options; +} + + +TProtobufMessageOptions GetMessageOptions(const Descriptor* descriptor) +{ + TProtobufMessageOptions options; + ParseProtobufMessageOptions( + descriptor->file()->options().GetRepeatedExtension(file_default_message_flags), + &options); + ParseProtobufMessageOptions( + descriptor->options().GetRepeatedExtension(message_flags), + &options); + return options; +} + +TNode MakeEnumerationConfig(const ::google::protobuf::EnumDescriptor* enumDescriptor) +{ + auto config = TNode::CreateMap(); + for (int i = 0; i < enumDescriptor->value_count(); ++i) { + config[enumDescriptor->value(i)->name()] = enumDescriptor->value(i)->number(); + } + return config; +} + +TString DeduceProtobufType( + const FieldDescriptor* fieldDescriptor, + const TProtobufFieldOptions& options) +{ + if (options.Type) { + ValidateProtobufType(*fieldDescriptor, *options.Type); + return ToString(*options.Type); + } + switch (fieldDescriptor->type()) { + case FieldDescriptor::TYPE_ENUM: + return ToString(EProtobufType::EnumString); + case FieldDescriptor::TYPE_MESSAGE: + switch (options.SerializationMode) { + case EProtobufSerializationMode::Protobuf: + return "message"; + case EProtobufSerializationMode::Yt: + return "structured_message"; + case EProtobufSerializationMode::Embedded: + return "embedded_message"; + } + Y_FAIL(); + default: + return fieldDescriptor->type_name(); + } + Y_FAIL(); +} + +TString GetColumnName(const ::google::protobuf::FieldDescriptor& field) +{ + const auto& options = field.options(); + const auto columnName = options.GetExtension(column_name); + if (!columnName.empty()) { + return columnName; + } + const auto keyColumnName = options.GetExtension(key_column_name); + if (!keyColumnName.empty()) { + return keyColumnName; + } + return field.name(); +} + +TNode MakeProtoFormatMessageFieldsConfig( + const Descriptor* descriptor, + TNode* enumerations, + TCycleChecker& cycleChecker); + +TNode 
MakeProtoFormatMessageFieldsConfig( + const Descriptor* descriptor, + TNode* enumerations, + const TProtobufFieldOptions& defaultFieldOptions, + const TProtobufOneofOptions& defaultOneofOptions, + TCycleChecker& cycleChecker); + +TNode MakeMapFieldsConfig( + const FieldDescriptor* fieldDescriptor, + TNode* enumerations, + const TProtobufFieldOptions& fieldOptions, + TCycleChecker& cycleChecker) +{ + Y_VERIFY(fieldDescriptor->is_map()); + auto message = fieldDescriptor->message_type(); + switch (fieldOptions.MapMode) { + case EProtobufMapMode::ListOfStructsLegacy: + return MakeProtoFormatMessageFieldsConfig( + message, + enumerations, + cycleChecker); + case EProtobufMapMode::ListOfStructs: + case EProtobufMapMode::Dict: + case EProtobufMapMode::OptionalDict: { + TProtobufFieldOptions defaultFieldOptions; + defaultFieldOptions.SerializationMode = EProtobufSerializationMode::Yt; + return MakeProtoFormatMessageFieldsConfig( + message, + enumerations, + defaultFieldOptions, + TProtobufOneofOptions{}, + cycleChecker); + } + } + Y_FAIL(); +} + +TNode MakeProtoFormatFieldConfig( + const FieldDescriptor* fieldDescriptor, + TNode* enumerations, + const TProtobufFieldOptions& defaultOptions, + TCycleChecker& cycleChecker) +{ + auto fieldConfig = TNode::CreateMap(); + fieldConfig["field_number"] = fieldDescriptor->number(); + fieldConfig["name"] = GetColumnName(*fieldDescriptor); + + auto fieldOptions = GetFieldOptions(fieldDescriptor, defaultOptions); + + Y_ENSURE(fieldOptions.SerializationMode != EProtobufSerializationMode::Embedded, + "EMBEDDED flag is currently supported only with " + "ProtobufFormatWithDescriptors config option set to true"); + + if (fieldDescriptor->is_repeated()) { + Y_ENSURE_EX(fieldOptions.SerializationMode == EProtobufSerializationMode::Yt, + TApiUsageError() << "Repeated field \"" << fieldDescriptor->full_name() << "\" " << + "must have flag \"" << EWrapperFieldFlag::SERIALIZATION_YT << "\""); + } + fieldConfig["repeated"] = 
fieldDescriptor->is_repeated(); + fieldConfig["packed"] = fieldDescriptor->is_packed(); + + fieldConfig["proto_type"] = DeduceProtobufType(fieldDescriptor, fieldOptions); + + if (fieldDescriptor->type() == FieldDescriptor::TYPE_ENUM) { + auto* enumeration = fieldDescriptor->enum_type(); + (*enumerations)[enumeration->full_name()] = MakeEnumerationConfig(enumeration); + fieldConfig["enumeration_name"] = enumeration->full_name(); + } + + if (fieldOptions.SerializationMode != EProtobufSerializationMode::Yt) { + return fieldConfig; + } + + if (fieldDescriptor->is_map()) { + fieldConfig["fields"] = MakeMapFieldsConfig(fieldDescriptor, enumerations, fieldOptions, cycleChecker); + return fieldConfig; + } + + if (fieldDescriptor->type() == FieldDescriptor::TYPE_MESSAGE) { + fieldConfig["fields"] = MakeProtoFormatMessageFieldsConfig( + fieldDescriptor->message_type(), + enumerations, + cycleChecker); + } + + return fieldConfig; +} + +void MakeProtoFormatOneofConfig( + const OneofDescriptor* oneofDescriptor, + TNode* enumerations, + const TProtobufFieldOptions& defaultFieldOptions, + const TProtobufOneofOptions& defaultOneofOptions, + TCycleChecker& cycleChecker, + TNode* fields) +{ + auto addFields = [&] (TNode* fields) { + for (int i = 0; i < oneofDescriptor->field_count(); ++i) { + fields->Add(MakeProtoFormatFieldConfig( + oneofDescriptor->field(i), + enumerations, + defaultFieldOptions, + cycleChecker)); + } + }; + + auto oneofOptions = GetOneofOptions(oneofDescriptor, defaultOneofOptions); + switch (oneofOptions.Mode) { + case EProtobufOneofMode::SeparateFields: + addFields(fields); + return; + case EProtobufOneofMode::Variant: { + auto oneofFields = TNode::CreateList(); + addFields(&oneofFields); + auto oneofField = TNode() + ("proto_type", "oneof") + ("name", oneofOptions.VariantFieldName) + ("fields", std::move(oneofFields)); + fields->Add(std::move(oneofField)); + return; + } + } + Y_FAIL(); +} + +TNode MakeProtoFormatMessageFieldsConfig( + const Descriptor* 
descriptor, + TNode* enumerations, + const TProtobufFieldOptions& defaultFieldOptions, + const TProtobufOneofOptions& defaultOneofOptions, + TCycleChecker& cycleChecker) +{ + auto fields = TNode::CreateList(); + THashSet<const OneofDescriptor*> visitedOneofs; + auto guard = cycleChecker.Enter(descriptor); + for (int fieldIndex = 0; fieldIndex < descriptor->field_count(); ++fieldIndex) { + auto fieldDescriptor = descriptor->field(fieldIndex); + auto oneofDescriptor = fieldDescriptor->containing_oneof(); + if (!oneofDescriptor) { + fields.Add(MakeProtoFormatFieldConfig( + fieldDescriptor, + enumerations, + defaultFieldOptions, + cycleChecker)); + } else if (!visitedOneofs.contains(oneofDescriptor)) { + MakeProtoFormatOneofConfig( + oneofDescriptor, + enumerations, + defaultFieldOptions, + defaultOneofOptions, + cycleChecker, + &fields); + visitedOneofs.insert(oneofDescriptor); + } + } + return fields; +} + +TNode MakeProtoFormatMessageFieldsConfig( + const Descriptor* descriptor, + TNode* enumerations, + TCycleChecker& cycleChecker) +{ + return MakeProtoFormatMessageFieldsConfig( + descriptor, + enumerations, + GetDefaultFieldOptions(descriptor), + GetDefaultOneofOptions(descriptor), + cycleChecker); +} + +TNode MakeProtoFormatConfigWithTables(const TVector<const Descriptor*>& descriptors) +{ + TNode config("protobuf"); + config.Attributes() + ("enumerations", TNode::CreateMap()) + ("tables", TNode::CreateList()); + + auto& enumerations = config.Attributes()["enumerations"]; + + for (auto* descriptor : descriptors) { + TCycleChecker cycleChecker; + auto columns = MakeProtoFormatMessageFieldsConfig(descriptor, &enumerations, cycleChecker); + config.Attributes()["tables"].Add( + TNode()("columns", std::move(columns))); + } + + return config; +} + +//////////////////////////////////////////////////////////////////////////////// + +class TFileDescriptorSetBuilder +{ +public: + TFileDescriptorSetBuilder() + : ExtensionFile_(EWrapperFieldFlag::descriptor()->file()) + { } + 
+ void AddDescriptor(const Descriptor* descriptor) + { + auto [it, inserted] = AllDescriptors_.insert(descriptor); + if (!inserted) { + return; + } + + const auto* containingType = descriptor->containing_type(); + while (containingType) { + AddDescriptor(containingType); + containingType = containingType->containing_type(); + } + for (int i = 0; i < descriptor->field_count(); ++i) { + AddField(descriptor->field(i)); + } + } + + FileDescriptorSet Build() + { + THashSet<const FileDescriptor*> visitedFiles; + TVector<const FileDescriptor*> fileTopoOrder; + for (const auto* descriptor : AllDescriptors_) { + TraverseDependencies(descriptor->file(), visitedFiles, fileTopoOrder); + } + + THashSet<TString> messageTypeNames; + THashSet<TString> enumTypeNames; + for (const auto* descriptor : AllDescriptors_) { + messageTypeNames.insert(descriptor->full_name()); + } + for (const auto* enumDescriptor : EnumDescriptors_) { + enumTypeNames.insert(enumDescriptor->full_name()); + } + FileDescriptorSet fileDescriptorSetProto; + for (const auto* file : fileTopoOrder) { + auto* fileProto = fileDescriptorSetProto.add_file(); + file->CopyTo(fileProto); + Strip(fileProto, messageTypeNames, enumTypeNames); + } + return fileDescriptorSetProto; + } + +private: + void AddField(const FieldDescriptor* fieldDescriptor) + { + if (fieldDescriptor->message_type()) { + AddDescriptor(fieldDescriptor->message_type()); + } + if (fieldDescriptor->enum_type()) { + AddEnumDescriptor(fieldDescriptor->enum_type()); + } + } + + void AddEnumDescriptor(const EnumDescriptor* enumDescriptor) + { + auto [it, inserted] = EnumDescriptors_.insert(enumDescriptor); + if (!inserted) { + return; + } + const auto* containingType = enumDescriptor->containing_type(); + while (containingType) { + AddDescriptor(containingType); + containingType = containingType->containing_type(); + } + } + + void TraverseDependencies( + const FileDescriptor* current, + THashSet<const FileDescriptor*>& visited, + TVector<const 
FileDescriptor*>& topoOrder) + { + auto [it, inserted] = visited.insert(current); + if (!inserted) { + return; + } + for (int i = 0; i < current->dependency_count(); ++i) { + TraverseDependencies(current->dependency(i), visited, topoOrder); + } + topoOrder.push_back(current); + } + + template <typename TOptions> + void StripUnknownOptions(TOptions* options) + { + std::vector<const FieldDescriptor*> fields; + auto reflection = options->GetReflection(); + reflection->ListFields(*options, &fields); + for (auto field : fields) { + if (field->is_extension() && field->file() != ExtensionFile_) { + reflection->ClearField(options, field); + } + } + } + + template <typename TRepeatedField, typename TPredicate> + void RemoveIf(TRepeatedField* repeatedField, TPredicate predicate) + { + repeatedField->erase( + std::remove_if(repeatedField->begin(), repeatedField->end(), predicate), + repeatedField->end()); + } + + void Strip( + const TString& containingTypePrefix, + DescriptorProto* messageProto, + const THashSet<TString>& messageTypeNames, + const THashSet<TString>& enumTypeNames) + { + const auto prefix = containingTypePrefix + messageProto->name() + '.'; + + RemoveIf(messageProto->mutable_nested_type(), [&] (const DescriptorProto& descriptorProto) { + return !messageTypeNames.contains(prefix + descriptorProto.name()); + }); + RemoveIf(messageProto->mutable_enum_type(), [&] (const EnumDescriptorProto& enumDescriptorProto) { + return !enumTypeNames.contains(prefix + enumDescriptorProto.name()); + }); + + messageProto->clear_extension(); + StripUnknownOptions(messageProto->mutable_options()); + for (auto& fieldProto : *messageProto->mutable_field()) { + StripUnknownOptions(fieldProto.mutable_options()); + } + for (auto& oneofProto : *messageProto->mutable_oneof_decl()) { + StripUnknownOptions(oneofProto.mutable_options()); + } + for (auto& nestedTypeProto : *messageProto->mutable_nested_type()) { + Strip(prefix, &nestedTypeProto, messageTypeNames, enumTypeNames); + } + for 
(auto& enumProto : *messageProto->mutable_enum_type()) { + StripUnknownOptions(enumProto.mutable_options()); + for (auto& enumValue : *enumProto.mutable_value()) { + StripUnknownOptions(enumValue.mutable_options()); + } + } + } + + void Strip( + FileDescriptorProto* fileProto, + const THashSet<TString>& messageTypeNames, + const THashSet<TString>& enumTypeNames) + { + const auto prefix = fileProto->package().Empty() + ? "" + : fileProto->package() + '.'; + + RemoveIf(fileProto->mutable_message_type(), [&] (const DescriptorProto& descriptorProto) { + return !messageTypeNames.contains(prefix + descriptorProto.name()); + }); + RemoveIf(fileProto->mutable_enum_type(), [&] (const EnumDescriptorProto& enumDescriptorProto) { + return !enumTypeNames.contains(prefix + enumDescriptorProto.name()); + }); + + fileProto->clear_service(); + fileProto->clear_extension(); + + StripUnknownOptions(fileProto->mutable_options()); + for (auto& messageProto : *fileProto->mutable_message_type()) { + Strip(prefix, &messageProto, messageTypeNames, enumTypeNames); + } + for (auto& enumProto : *fileProto->mutable_enum_type()) { + StripUnknownOptions(enumProto.mutable_options()); + for (auto& enumValue : *enumProto.mutable_value()) { + StripUnknownOptions(enumValue.mutable_options()); + } + } + } + +private: + const FileDescriptor* const ExtensionFile_; + THashSet<const Descriptor*> AllDescriptors_; + THashSet<const EnumDescriptor*> EnumDescriptors_; +}; + +TNode MakeProtoFormatConfigWithDescriptors(const TVector<const Descriptor*>& descriptors) +{ + TFileDescriptorSetBuilder builder; + auto typeNames = TNode::CreateList(); + for (const auto* descriptor : descriptors) { + builder.AddDescriptor(descriptor); + typeNames.Add(descriptor->full_name()); + } + + auto fileDescriptorSetText = builder.Build().ShortDebugString(); + TNode config("protobuf"); + config.Attributes() + ("file_descriptor_set_text", std::move(fileDescriptorSetText)) + ("type_names", std::move(typeNames)); + return config; +} + 
+//////////////////////////////////////////////////////////////////////////////// + +using TTypePtrOrOtherColumns = std::variant<NTi::TTypePtr, TOtherColumns>; + +struct TMember { + TString Name; + TTypePtrOrOtherColumns TypeOrOtherColumns; +}; + +//////////////////////////////////////////////////////////////////////////////// + +TValueTypeOrOtherColumns GetScalarFieldType( + const FieldDescriptor& fieldDescriptor, + const TProtobufFieldOptions& options) +{ + if (options.Type) { + switch (*options.Type) { + case EProtobufType::EnumInt: + return EValueType::VT_INT64; + case EProtobufType::EnumString: + return EValueType::VT_STRING; + case EProtobufType::Any: + return EValueType::VT_ANY; + case EProtobufType::OtherColumns: + return TOtherColumns{}; + } + Y_FAIL(); + } + + switch (fieldDescriptor.cpp_type()) { + case FieldDescriptor::CPPTYPE_INT32: + return EValueType::VT_INT32; + case FieldDescriptor::CPPTYPE_INT64: + return EValueType::VT_INT64; + case FieldDescriptor::CPPTYPE_UINT32: + return EValueType::VT_UINT32; + case FieldDescriptor::CPPTYPE_UINT64: + return EValueType::VT_UINT64; + case FieldDescriptor::CPPTYPE_FLOAT: + case FieldDescriptor::CPPTYPE_DOUBLE: + return EValueType::VT_DOUBLE; + case FieldDescriptor::CPPTYPE_BOOL: + return EValueType::VT_BOOLEAN; + case FieldDescriptor::CPPTYPE_STRING: + case FieldDescriptor::CPPTYPE_MESSAGE: + case FieldDescriptor::CPPTYPE_ENUM: + return EValueType::VT_STRING; + default: + ythrow yexception() << + "Unexpected field type '" << fieldDescriptor.cpp_type_name() << "' " << + "for field " << fieldDescriptor.name(); + } +} + +bool HasNameExtension(const FieldDescriptor& fieldDescriptor) +{ + const auto& options = fieldDescriptor.options(); + return options.HasExtension(column_name) || options.HasExtension(key_column_name); +} + +void SortFields(TVector<const FieldDescriptor*>& fieldDescriptors, EProtobufFieldSortOrder fieldSortOrder) +{ + switch (fieldSortOrder) { + case EProtobufFieldSortOrder::AsInProtoFile: + return; 
+ case EProtobufFieldSortOrder::ByFieldNumber: + SortBy(fieldDescriptors, [] (const FieldDescriptor* fieldDescriptor) { + return fieldDescriptor->number(); + }); + return; + } + Y_FAIL(); +} + +NTi::TTypePtr CreateStruct(TStringBuf fieldName, TVector<TMember> members) +{ + TVector<NTi::TStructType::TOwnedMember> structMembers; + structMembers.reserve(members.size()); + for (auto& member : members) { + std::visit(TOverloaded{ + [&] (TOtherColumns) { + ythrow TApiUsageError() << + "Could not deduce YT type for field " << member.Name << " of " << + "embedded message field " << fieldName << " " << + "(note that " << EWrapperFieldFlag::OTHER_COLUMNS << " fields " << + "are not allowed inside embedded messages)"; + }, + [&] (NTi::TTypePtr& type) { + structMembers.emplace_back(std::move(member.Name), std::move(type)); + }, + }, member.TypeOrOtherColumns); + } + return NTi::Struct(std::move(structMembers)); +} + +TMaybe<TVector<TString>> InferColumnFilter(const ::google::protobuf::Descriptor& descriptor) +{ + auto isOtherColumns = [] (const ::google::protobuf::FieldDescriptor& field) { + return GetFieldOptions(&field).Type == EProtobufType::OtherColumns; + }; + + TVector<TString> result; + result.reserve(descriptor.field_count()); + for (int i = 0; i < descriptor.field_count(); ++i) { + const auto& field = *descriptor.field(i); + if (isOtherColumns(field)) { + return {}; + } + result.push_back(GetColumnName(field)); + } + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +class TTableSchemaInferrer +{ +public: + TTableSchemaInferrer(bool keepFieldsWithoutExtension) + : KeepFieldsWithoutExtension_(keepFieldsWithoutExtension) + { } + + TTableSchema InferSchema(const Descriptor& messageDescriptor); + +private: + TTypePtrOrOtherColumns GetFieldType( + const FieldDescriptor& fieldDescriptor, + const TProtobufFieldOptions& defaultOptions); + + void ProcessOneofField( + TStringBuf containingFieldName, + const OneofDescriptor& 
oneofDescriptor, + const TProtobufFieldOptions& defaultFieldOptions, + const TProtobufOneofOptions& defaultOneofOptions, + EProtobufFieldSortOrder fieldSortOrder, + TVector<TMember>* members); + + TVector<TMember> GetMessageMembers( + TStringBuf containingFieldName, + const Descriptor& fieldDescriptor, + TProtobufFieldOptions defaultFieldOptions, + std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder = std::nullopt); + + NTi::TTypePtr GetMessageType( + const FieldDescriptor& fieldDescriptor, + TProtobufFieldOptions defaultFieldOptions); + + NTi::TTypePtr GetMapType( + const FieldDescriptor& fieldDescriptor, + const TProtobufFieldOptions& fieldOptions); + +private: + void GetMessageMembersImpl( + TStringBuf containingFieldName, + const Descriptor& fieldDescriptor, + TProtobufFieldOptions defaultFieldOptions, + std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder, + TVector<TMember>* members); + +private: + const bool KeepFieldsWithoutExtension_; + TCycleChecker CycleChecker_; +}; + +void TTableSchemaInferrer::ProcessOneofField( + TStringBuf containingFieldName, + const OneofDescriptor& oneofDescriptor, + const TProtobufFieldOptions& defaultFieldOptions, + const TProtobufOneofOptions& defaultOneofOptions, + EProtobufFieldSortOrder fieldSortOrder, + TVector<TMember>* members) +{ + auto oneofOptions = GetOneofOptions(&oneofDescriptor, defaultOneofOptions); + + auto addFields = [&] (TVector<TMember>* members, bool removeOptionality) { + TVector<const FieldDescriptor*> fieldDescriptors; + for (int i = 0; i < oneofDescriptor.field_count(); ++i) { + fieldDescriptors.push_back(oneofDescriptor.field(i)); + } + SortFields(fieldDescriptors, fieldSortOrder); + for (auto innerFieldDescriptor : fieldDescriptors) { + auto typeOrOtherColumns = GetFieldType( + *innerFieldDescriptor, + defaultFieldOptions); + if (auto* maybeType = std::get_if<NTi::TTypePtr>(&typeOrOtherColumns); + maybeType && removeOptionality && (*maybeType)->IsOptional()) + { + typeOrOtherColumns 
= (*maybeType)->AsOptional()->GetItemType(); + } + members->push_back(TMember{ + GetColumnName(*innerFieldDescriptor), + std::move(typeOrOtherColumns), + }); + } + }; + + switch (oneofOptions.Mode) { + case EProtobufOneofMode::SeparateFields: + addFields(members, /* removeOptionality */ false); + return; + case EProtobufOneofMode::Variant: { + TVector<TMember> variantMembers; + addFields(&variantMembers, /* removeOptionality */ true); + members->push_back(TMember{ + oneofOptions.VariantFieldName, + NTi::Optional( + NTi::Variant( + CreateStruct(containingFieldName, std::move(variantMembers)) + ) + ) + }); + return; + } + } + Y_FAIL(); +} + +TVector<TMember> TTableSchemaInferrer::GetMessageMembers( + TStringBuf containingFieldName, + const Descriptor& messageDescriptor, + TProtobufFieldOptions defaultFieldOptions, + std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder) +{ + TVector<TMember> members; + GetMessageMembersImpl( + containingFieldName, + messageDescriptor, + defaultFieldOptions, + overrideFieldSortOrder, + &members + ); + return members; +} + +void TTableSchemaInferrer::GetMessageMembersImpl( + TStringBuf containingFieldName, + const Descriptor& messageDescriptor, + TProtobufFieldOptions defaultFieldOptions, + std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder, + TVector<TMember>* members) +{ + auto guard = CycleChecker_.Enter(&messageDescriptor); + defaultFieldOptions = GetDefaultFieldOptions(&messageDescriptor, defaultFieldOptions); + auto messageOptions = GetMessageOptions(&messageDescriptor); + auto defaultOneofOptions = GetDefaultOneofOptions(&messageDescriptor); + + TVector<const FieldDescriptor*> fieldDescriptors; + fieldDescriptors.reserve(messageDescriptor.field_count()); + for (int i = 0; i < messageDescriptor.field_count(); ++i) { + if (!KeepFieldsWithoutExtension_ && !HasNameExtension(*messageDescriptor.field(i))) { + continue; + } + fieldDescriptors.push_back(messageDescriptor.field(i)); + } + + auto fieldSortOrder = 
overrideFieldSortOrder.value_or(messageOptions.FieldSortOrder); + SortFields(fieldDescriptors, fieldSortOrder); + + THashSet<const OneofDescriptor*> visitedOneofs; + for (const auto innerFieldDescriptor : fieldDescriptors) { + auto oneofDescriptor = innerFieldDescriptor->containing_oneof(); + if (oneofDescriptor) { + if (visitedOneofs.contains(oneofDescriptor)) { + continue; + } + ProcessOneofField( + containingFieldName, + *oneofDescriptor, + defaultFieldOptions, + defaultOneofOptions, + messageOptions.FieldSortOrder, + members); + visitedOneofs.insert(oneofDescriptor); + continue; + } + auto fieldOptions = GetFieldOptions(innerFieldDescriptor, defaultFieldOptions); + if (fieldOptions.SerializationMode == EProtobufSerializationMode::Embedded) { + Y_ENSURE(innerFieldDescriptor->type() == FieldDescriptor::TYPE_MESSAGE, + "EMBEDDED column must have message type"); + Y_ENSURE(innerFieldDescriptor->label() == FieldDescriptor::LABEL_REQUIRED, + "EMBEDDED column must be marked required"); + GetMessageMembersImpl( + innerFieldDescriptor->full_name(), + *innerFieldDescriptor->message_type(), + defaultFieldOptions, + /*overrideFieldSortOrder*/ std::nullopt, + members); + } else { + auto typeOrOtherColumns = GetFieldType( + *innerFieldDescriptor, + defaultFieldOptions); + members->push_back(TMember{ + GetColumnName(*innerFieldDescriptor), + std::move(typeOrOtherColumns), + }); + } + } +} + +NTi::TTypePtr TTableSchemaInferrer::GetMessageType( + const FieldDescriptor& fieldDescriptor, + TProtobufFieldOptions defaultFieldOptions) +{ + Y_VERIFY(fieldDescriptor.message_type()); + const auto& messageDescriptor = *fieldDescriptor.message_type(); + auto members = GetMessageMembers( + fieldDescriptor.full_name(), + messageDescriptor, + defaultFieldOptions); + + return CreateStruct(fieldDescriptor.full_name(), std::move(members)); +} + +NTi::TTypePtr TTableSchemaInferrer::GetMapType( + const FieldDescriptor& fieldDescriptor, + const TProtobufFieldOptions& fieldOptions) +{ + 
Y_VERIFY(fieldDescriptor.is_map()); + switch (fieldOptions.MapMode) { + case EProtobufMapMode::ListOfStructsLegacy: + case EProtobufMapMode::ListOfStructs: { + TProtobufFieldOptions embeddedOptions; + if (fieldOptions.MapMode == EProtobufMapMode::ListOfStructs) { + embeddedOptions.SerializationMode = EProtobufSerializationMode::Yt; + } + auto list = NTi::List(GetMessageType(fieldDescriptor, embeddedOptions)); + switch (fieldOptions.ListMode) { + case EProtobufListMode::Required: + return list; + case EProtobufListMode::Optional: + return NTi::Optional(std::move(list)); + } + Y_FAIL(); + } + case EProtobufMapMode::Dict: + case EProtobufMapMode::OptionalDict: { + auto message = fieldDescriptor.message_type(); + Y_VERIFY(message->field_count() == 2); + auto keyVariant = GetScalarFieldType(*message->field(0), TProtobufFieldOptions{}); + Y_VERIFY(std::holds_alternative<EValueType>(keyVariant)); + auto key = std::get<EValueType>(keyVariant); + TProtobufFieldOptions embeddedOptions; + embeddedOptions.SerializationMode = EProtobufSerializationMode::Yt; + auto valueVariant = GetFieldType(*message->field(1), embeddedOptions); + Y_VERIFY(std::holds_alternative<NTi::TTypePtr>(valueVariant)); + auto value = std::get<NTi::TTypePtr>(valueVariant); + Y_VERIFY(value->IsOptional()); + value = value->AsOptional()->GetItemType(); + auto dict = NTi::Dict(ToTypeV3(key, true), value); + if (fieldOptions.MapMode == EProtobufMapMode::OptionalDict) { + return NTi::Optional(dict); + } else { + return dict; + } + } + } +} + +TTypePtrOrOtherColumns TTableSchemaInferrer::GetFieldType( + const FieldDescriptor& fieldDescriptor, + const TProtobufFieldOptions& defaultOptions) +{ + auto fieldOptions = GetFieldOptions(&fieldDescriptor, defaultOptions); + if (fieldOptions.Type) { + ValidateProtobufType(fieldDescriptor, *fieldOptions.Type); + } + + auto getScalarType = [&] { + auto valueTypeOrOtherColumns = GetScalarFieldType(fieldDescriptor, fieldOptions); + return std::visit(TOverloaded{ + [] 
(TOtherColumns) -> TTypePtrOrOtherColumns { + return TOtherColumns{}; + }, + [] (EValueType valueType) -> TTypePtrOrOtherColumns { + return ToTypeV3(valueType, true); + } + }, valueTypeOrOtherColumns); + }; + + auto withFieldLabel = [&] (const TTypePtrOrOtherColumns& typeOrOtherColumns) -> TTypePtrOrOtherColumns { + switch (fieldDescriptor.label()) { + case FieldDescriptor::Label::LABEL_REPEATED: { + Y_ENSURE(fieldOptions.SerializationMode == EProtobufSerializationMode::Yt, + "Repeated fields are supported only for YT serialization mode, field \"" + fieldDescriptor.full_name() + + "\" has incorrect serialization mode"); + auto* type = std::get_if<NTi::TTypePtr>(&typeOrOtherColumns); + Y_ENSURE(type, "OTHER_COLUMNS field can not be repeated"); + switch (fieldOptions.ListMode) { + case EProtobufListMode::Required: + return NTi::TTypePtr(NTi::List(*type)); + case EProtobufListMode::Optional: + return NTi::TTypePtr(NTi::Optional(NTi::List(*type))); + } + Y_FAIL(); + } + case FieldDescriptor::Label::LABEL_OPTIONAL: + return std::visit(TOverloaded{ + [] (TOtherColumns) -> TTypePtrOrOtherColumns { + return TOtherColumns{}; + }, + [] (NTi::TTypePtr type) -> TTypePtrOrOtherColumns { + return NTi::TTypePtr(NTi::Optional(std::move(type))); + } + }, typeOrOtherColumns); + case FieldDescriptor::LABEL_REQUIRED: { + auto* type = std::get_if<NTi::TTypePtr>(&typeOrOtherColumns); + Y_ENSURE(type, "OTHER_COLUMNS field can not be required"); + return *type; + } + } + Y_FAIL(); + }; + + switch (fieldOptions.SerializationMode) { + case EProtobufSerializationMode::Protobuf: + return withFieldLabel(getScalarType()); + case EProtobufSerializationMode::Yt: + if (fieldDescriptor.type() == FieldDescriptor::TYPE_MESSAGE) { + if (fieldDescriptor.is_map()) { + return GetMapType(fieldDescriptor, fieldOptions); + } else { + return withFieldLabel(GetMessageType(fieldDescriptor, TProtobufFieldOptions{})); + } + } else { + return withFieldLabel(getScalarType()); + } + case 
EProtobufSerializationMode::Embedded: + ythrow yexception() << "EMBEDDED field is not allowed for field " + << fieldDescriptor.full_name(); + } + Y_FAIL(); +} + +TTableSchema TTableSchemaInferrer::InferSchema(const Descriptor& messageDescriptor) +{ + TTableSchema result; + + auto defaultFieldOptions = GetDefaultFieldOptions(&messageDescriptor); + auto members = GetMessageMembers( + messageDescriptor.full_name(), + messageDescriptor, + defaultFieldOptions, + // Use special sort order for top level messages. + /*overrideFieldSortOrder*/ EProtobufFieldSortOrder::AsInProtoFile); + + for (auto& member : members) { + std::visit(TOverloaded{ + [&] (TOtherColumns) { + result.Strict(false); + }, + [&] (NTi::TTypePtr& type) { + result.AddColumn(TColumnSchema() + .Name(std::move(member.Name)) + .Type(std::move(type)) + ); + }, + }, member.TypeOrOtherColumns); + } + + return result; +} + +TTableSchema CreateTableSchemaImpl( + const Descriptor& messageDescriptor, + bool keepFieldsWithoutExtension) +{ + TTableSchemaInferrer inferrer(keepFieldsWithoutExtension); + return inferrer.InferSchema(messageDescriptor); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail + +//////////////////////////////////////////////////////////////////////////////// + +template <> +void Out<NYT::EWrapperFieldFlag::Enum>(IOutputStream& stream, NYT::EWrapperFieldFlag::Enum value) +{ + stream << NYT::EWrapperFieldFlag_Enum_Name(value); +} + +template <> +void Out<NYT::EWrapperMessageFlag::Enum>(IOutputStream& stream, NYT::EWrapperMessageFlag::Enum value) +{ + stream << NYT::EWrapperMessageFlag_Enum_Name(value); +} + +template <> +void Out<NYT::EWrapperOneofFlag::Enum>(IOutputStream& stream, NYT::EWrapperOneofFlag::Enum value) +{ + stream << NYT::EWrapperOneofFlag_Enum_Name(value); +} diff --git a/yt/cpp/mapreduce/interface/protobuf_format.h b/yt/cpp/mapreduce/interface/protobuf_format.h new file mode 100644 index 0000000000..aafbced386 --- 
/dev/null +++ b/yt/cpp/mapreduce/interface/protobuf_format.h @@ -0,0 +1,106 @@ +#pragma once + +#include "common.h" + +#include <yt/yt_proto/yt/formats/extension.pb.h> + +#include <util/generic/maybe.h> + +#include <google/protobuf/message.h> + +/// @cond Doxygen_Suppress +namespace NYT::NDetail { + +//////////////////////////////////////////////////////////////////////////////// + +enum class EProtobufType +{ + EnumInt /* "enum_int" */, + EnumString /* "enum_string" */, + Any /* "any" */, + OtherColumns /* "other_columns" */, +}; + +enum class EProtobufSerializationMode +{ + Protobuf, + Yt, + Embedded, +}; + +enum class EProtobufListMode +{ + Optional, + Required, +}; + +enum class EProtobufMapMode +{ + ListOfStructsLegacy, + ListOfStructs, + Dict, + OptionalDict, +}; + +enum class EProtobufFieldSortOrder +{ + AsInProtoFile, + ByFieldNumber, +}; + +enum class EProtobufOneofMode +{ + SeparateFields, + Variant, +}; + +enum class EProtobufEnumWritingMode +{ + SkipUnknownValues, + CheckValues, +}; + +struct TProtobufOneofOptions +{ + EProtobufOneofMode Mode = EProtobufOneofMode::Variant; + TString VariantFieldName; +}; + +struct TProtobufFieldOptions +{ + TMaybe<EProtobufType> Type; + EProtobufSerializationMode SerializationMode = EProtobufSerializationMode::Protobuf; + EProtobufListMode ListMode = EProtobufListMode::Required; + EProtobufMapMode MapMode = EProtobufMapMode::ListOfStructsLegacy; +}; + +struct TProtobufMessageOptions +{ + EProtobufFieldSortOrder FieldSortOrder = EProtobufFieldSortOrder::ByFieldNumber; +}; + +TString GetColumnName(const ::google::protobuf::FieldDescriptor& field); + +TProtobufFieldOptions GetFieldOptions( + const ::google::protobuf::FieldDescriptor* fieldDescriptor, + const TMaybe<TProtobufFieldOptions>& defaultFieldOptions = {}); + +TProtobufOneofOptions GetOneofOptions( + const ::google::protobuf::OneofDescriptor* oneofDescriptor, + const TMaybe<TProtobufOneofOptions>& defaultOneofOptions = {}); + +TProtobufMessageOptions 
GetMessageOptions(const ::google::protobuf::Descriptor* descriptor); + +TMaybe<TVector<TString>> InferColumnFilter(const ::google::protobuf::Descriptor& descriptor); + +TNode MakeProtoFormatConfigWithTables(const TVector<const ::google::protobuf::Descriptor*>& descriptors); +TNode MakeProtoFormatConfigWithDescriptors(const TVector<const ::google::protobuf::Descriptor*>& descriptors); + +TTableSchema CreateTableSchemaImpl( + const ::google::protobuf::Descriptor& messageDescriptor, + bool keepFieldsWithoutExtension); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NDetail +/// @endcond diff --git a/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.cpp b/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.cpp new file mode 100644 index 0000000000..19a3d5163f --- /dev/null +++ b/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.cpp @@ -0,0 +1,451 @@ +#include "common.h" +#include "errors.h" +#include "common_ut.h" +#include "util/generic/fwd.h" + +#include <yt/cpp/mapreduce/interface/protobuf_table_schema_ut.pb.h> +#include <yt/cpp/mapreduce/interface/proto3_ut.pb.h> + +#include <yt/cpp/mapreduce/tests/yt_unittest_lib/yt_unittest_lib.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <algorithm> + +using namespace NYT; + +bool IsFieldPresent(const TTableSchema& schema, TStringBuf name) +{ + for (const auto& field : schema.Columns()) { + if (field.Name() == name) { + return true; + } + } + return false; +} + +Y_UNIT_TEST_SUITE(ProtoSchemaTest_Simple) +{ + Y_UNIT_TEST(TIntegral) + { + const auto schema = CreateTableSchema<NUnitTesting::TIntegral>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("DoubleField").Type(ToTypeV3(EValueType::VT_DOUBLE, false))) + .AddColumn(TColumnSchema().Name("FloatField").Type(ToTypeV3(EValueType::VT_DOUBLE, false))) + .AddColumn(TColumnSchema().Name("Int32Field").Type(ToTypeV3(EValueType::VT_INT32, false))) + 
.AddColumn(TColumnSchema().Name("Int64Field").Type(ToTypeV3(EValueType::VT_INT64, false))) + .AddColumn(TColumnSchema().Name("Uint32Field").Type(ToTypeV3(EValueType::VT_UINT32, false))) + .AddColumn(TColumnSchema().Name("Uint64Field").Type(ToTypeV3(EValueType::VT_UINT64, false))) + .AddColumn(TColumnSchema().Name("Sint32Field").Type(ToTypeV3(EValueType::VT_INT32, false))) + .AddColumn(TColumnSchema().Name("Sint64Field").Type(ToTypeV3(EValueType::VT_INT64, false))) + .AddColumn(TColumnSchema().Name("Fixed32Field").Type(ToTypeV3(EValueType::VT_UINT32, false))) + .AddColumn(TColumnSchema().Name("Fixed64Field").Type(ToTypeV3(EValueType::VT_UINT64, false))) + .AddColumn(TColumnSchema().Name("Sfixed32Field").Type(ToTypeV3(EValueType::VT_INT32, false))) + .AddColumn(TColumnSchema().Name("Sfixed64Field").Type(ToTypeV3(EValueType::VT_INT64, false))) + .AddColumn(TColumnSchema().Name("BoolField").Type(ToTypeV3(EValueType::VT_BOOLEAN, false))) + .AddColumn(TColumnSchema().Name("EnumField").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(TOneOf) + { + const auto schema = CreateTableSchema<NUnitTesting::TOneOf>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("DoubleField").Type(ToTypeV3(EValueType::VT_DOUBLE, false))) + .AddColumn(TColumnSchema().Name("Int32Field").Type(ToTypeV3(EValueType::VT_INT32, false))) + .AddColumn(TColumnSchema().Name("BoolField").Type(ToTypeV3(EValueType::VT_BOOLEAN, false)))); + } + + Y_UNIT_TEST(TWithRequired) + { + const auto schema = CreateTableSchema<NUnitTesting::TWithRequired>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("RequiredField").Type(ToTypeV3(EValueType::VT_STRING, true))) + .AddColumn(TColumnSchema().Name("NotRequiredField").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(TAggregated) + { + const auto schema = CreateTableSchema<NUnitTesting::TAggregated>(); + + UNIT_ASSERT_VALUES_EQUAL(6, 
schema.Columns().size()); + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("StringField").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("BytesField").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("NestedField").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("NestedRepeatedField").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("NestedOneOfField").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("NestedRecursiveField").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(TAliased) + { + const auto schema = CreateTableSchema<NUnitTesting::TAliased>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("key").Type(ToTypeV3(EValueType::VT_INT32, false))) + .AddColumn(TColumnSchema().Name("subkey").Type(ToTypeV3(EValueType::VT_DOUBLE, false))) + .AddColumn(TColumnSchema().Name("Data").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(SortColumns) + { + const TSortColumns keys = {"key", "subkey"}; + + const auto schema = CreateTableSchema<NUnitTesting::TAliased>(keys); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("key") + .Type(ToTypeV3(EValueType::VT_INT32, false)) + .SortOrder(ESortOrder::SO_ASCENDING)) + .AddColumn(TColumnSchema() + .Name("subkey") + .Type(ToTypeV3(EValueType::VT_DOUBLE, false)) + .SortOrder(ESortOrder::SO_ASCENDING)) + .AddColumn(TColumnSchema().Name("Data").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(SortColumnsReordered) + { + const TSortColumns keys = {"subkey"}; + + const auto schema = CreateTableSchema<NUnitTesting::TAliased>(keys); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("subkey") + .Type(ToTypeV3(EValueType::VT_DOUBLE, false)) + 
.SortOrder(ESortOrder::SO_ASCENDING)) + .AddColumn(TColumnSchema().Name("key").Type(ToTypeV3(EValueType::VT_INT32, false))) + .AddColumn(TColumnSchema().Name("Data").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(SortColumnsInvalid) + { + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TAliased>({"subkey", "subkey"}), yexception); + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TAliased>({"key", "junk"}), yexception); + } + + Y_UNIT_TEST(KeepFieldsWithoutExtensionTrue) + { + const auto schema = CreateTableSchema<NUnitTesting::TAliased>({}, true); + UNIT_ASSERT(IsFieldPresent(schema, "key")); + UNIT_ASSERT(IsFieldPresent(schema, "subkey")); + UNIT_ASSERT(IsFieldPresent(schema, "Data")); + UNIT_ASSERT(schema.Strict()); + } + + Y_UNIT_TEST(KeepFieldsWithoutExtensionFalse) + { + const auto schema = CreateTableSchema<NUnitTesting::TAliased>({}, false); + UNIT_ASSERT(IsFieldPresent(schema, "key")); + UNIT_ASSERT(IsFieldPresent(schema, "subkey")); + UNIT_ASSERT(!IsFieldPresent(schema, "Data")); + UNIT_ASSERT(schema.Strict()); + } + + Y_UNIT_TEST(ProtobufTypeOption) + { + const auto schema = CreateTableSchema<NUnitTesting::TWithTypeOptions>({}); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .Strict(false) + .AddColumn(TColumnSchema().Name("ColorIntField").Type(ToTypeV3(EValueType::VT_INT64, false))) + .AddColumn(TColumnSchema().Name("ColorStringField").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("AnyField").Type(ToTypeV3(EValueType::VT_ANY, false))) + .AddColumn(TColumnSchema().Name("EmbeddedField").Type( + NTi::Optional(NTi::Struct({ + {"ColorIntField", ToTypeV3(EValueType::VT_INT64, false)}, + {"ColorStringField", ToTypeV3(EValueType::VT_STRING, false)}, + {"AnyField", ToTypeV3(EValueType::VT_ANY, false)}})))) + .AddColumn(TColumnSchema().Name("RepeatedEnumIntField").Type(NTi::List(NTi::Int64())))); + } + + Y_UNIT_TEST(ProtobufTypeOption_TypeMismatch) + { + UNIT_ASSERT_EXCEPTION( + 
CreateTableSchema<NUnitTesting::TWithTypeOptions_TypeMismatch_EnumInt>({}), + yexception); + UNIT_ASSERT_EXCEPTION( + CreateTableSchema<NUnitTesting::TWithTypeOptions_TypeMismatch_EnumString>({}), + yexception); + UNIT_ASSERT_EXCEPTION( + CreateTableSchema<NUnitTesting::TWithTypeOptions_TypeMismatch_Any>({}), + yexception); + UNIT_ASSERT_EXCEPTION( + CreateTableSchema<NUnitTesting::TWithTypeOptions_TypeMismatch_OtherColumns>({}), + yexception); + } +} + +Y_UNIT_TEST_SUITE(ProtoSchemaTest_Complex) +{ + Y_UNIT_TEST(TRepeated) + { + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TRepeated>(), yexception); + + const auto schema = CreateTableSchema<NUnitTesting::TRepeatedYtMode>(); + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("Int32Field").Type(NTi::List(ToTypeV3(EValueType::VT_INT32, true))))); + } + + Y_UNIT_TEST(TRepeatedOptionalList) + { + const auto schema = CreateTableSchema<NUnitTesting::TOptionalList>(); + auto type = NTi::Optional(NTi::List(NTi::Int64())); + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("OptionalListInt64").TypeV3(type))); + } + + NTi::TTypePtr GetUrlRowType(bool required) + { + static const NTi::TTypePtr structType = NTi::Struct({ + {"Host", ToTypeV3(EValueType::VT_STRING, false)}, + {"Path", ToTypeV3(EValueType::VT_STRING, false)}, + {"HttpCode", ToTypeV3(EValueType::VT_INT32, false)}}); + return required ? 
structType : NTi::TTypePtr(NTi::Optional(structType)); + } + + Y_UNIT_TEST(TRowFieldSerializationOption) + { + const auto schema = CreateTableSchema<NUnitTesting::TRowFieldSerializationOption>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("UrlRow_1").Type(GetUrlRowType(false))) + .AddColumn(TColumnSchema().Name("UrlRow_2").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(TRowMessageSerializationOption) + { + const auto schema = CreateTableSchema<NUnitTesting::TRowMessageSerializationOption>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("UrlRow_1").Type(GetUrlRowType(false))) + .AddColumn(TColumnSchema().Name("UrlRow_2").Type(GetUrlRowType(false)))); + } + + Y_UNIT_TEST(TRowMixedSerializationOptions) + { + const auto schema = CreateTableSchema<NUnitTesting::TRowMixedSerializationOptions>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("UrlRow_1").Type(GetUrlRowType(false))) + .AddColumn(TColumnSchema().Name("UrlRow_2").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + NTi::TTypePtr GetUrlRowType_ColumnNames(bool required) + { + static const NTi::TTypePtr type = NTi::Struct({ + {"Host_ColumnName", ToTypeV3(EValueType::VT_STRING, false)}, + {"Path_KeyColumnName", ToTypeV3(EValueType::VT_STRING, false)}, + {"HttpCode", ToTypeV3(EValueType::VT_INT32, false)}, + }); + return required ? 
type : NTi::TTypePtr(NTi::Optional(type)); + } + + Y_UNIT_TEST(TRowMixedSerializationOptions_ColumnNames) + { + const auto schema = CreateTableSchema<NUnitTesting::TRowMixedSerializationOptions_ColumnNames>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("UrlRow_1").Type(GetUrlRowType_ColumnNames(false))) + .AddColumn(TColumnSchema().Name("UrlRow_2").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(NoOptionInheritance) + { + auto deepestEmbedded = NTi::Optional(NTi::Struct({{"x", ToTypeV3(EValueType::VT_INT64, false)}})); + + const auto schema = CreateTableSchema<NUnitTesting::TNoOptionInheritance>(); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("EmbeddedYt_YtOption") + .Type(NTi::Optional(NTi::Struct({{"embedded", deepestEmbedded}})))) + .AddColumn(TColumnSchema().Name("EmbeddedYt_ProtobufOption").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("EmbeddedYt_NoOption").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema() + .Name("EmbeddedProtobuf_YtOption") + .Type(NTi::Optional(NTi::Struct({{"embedded", ToTypeV3(EValueType::VT_STRING, false)}})))) + .AddColumn(TColumnSchema().Name("EmbeddedProtobuf_ProtobufOption").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("EmbeddedProtobuf_NoOption").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema() + .Name("Embedded_YtOption") + .Type(NTi::Optional(NTi::Struct({{"embedded", ToTypeV3(EValueType::VT_STRING, false)}})))) + .AddColumn(TColumnSchema().Name("Embedded_ProtobufOption").Type(ToTypeV3(EValueType::VT_STRING, false))) + .AddColumn(TColumnSchema().Name("Embedded_NoOption").Type(ToTypeV3(EValueType::VT_STRING, false)))); + } + + Y_UNIT_TEST(Cyclic) + { + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TCyclic>(), TApiUsageError); + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TCyclic::TA>(), 
TApiUsageError); + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TCyclic::TB>(), TApiUsageError); + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TCyclic::TC>(), TApiUsageError); + UNIT_ASSERT_EXCEPTION(CreateTableSchema<NUnitTesting::TCyclic::TD>(), TApiUsageError); + + ASSERT_SERIALIZABLES_EQUAL( + TTableSchema().AddColumn( + TColumnSchema().Name("d").TypeV3(NTi::Optional(NTi::String()))), + CreateTableSchema<NUnitTesting::TCyclic::TE>()); + } + + Y_UNIT_TEST(FieldSortOrder) + { + const auto schema = CreateTableSchema<NUnitTesting::TFieldSortOrder>(); + + auto byFieldNumber = NTi::Optional(NTi::Struct({ + {"z", NTi::Optional(NTi::Bool())}, + {"x", NTi::Optional(NTi::Int64())}, + {"y", NTi::Optional(NTi::String())}, + })); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema().Name("EmbeddedDefault").Type(byFieldNumber)) + .AddColumn(TColumnSchema() + .Name("EmbeddedAsInProtoFile") + .Type(NTi::Optional(NTi::Struct({ + {"x", NTi::Optional(NTi::Int64())}, + {"y", NTi::Optional(NTi::String())}, + {"z", NTi::Optional(NTi::Bool())}, + })))) + .AddColumn(TColumnSchema().Name("EmbeddedByFieldNumber").Type(byFieldNumber))); + } + + Y_UNIT_TEST(Map) + { + const auto schema = CreateTableSchema<NUnitTesting::TWithMap>(); + + auto createKeyValueStruct = [] (NTi::TTypePtr key, NTi::TTypePtr value) { + return NTi::List(NTi::Struct({ + {"key", NTi::Optional(key)}, + {"value", NTi::Optional(value)}, + })); + }; + + auto embedded = NTi::Struct({ + {"x", NTi::Optional(NTi::Int64())}, + {"y", NTi::Optional(NTi::String())}, + }); + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("MapDefault") + .Type(createKeyValueStruct(NTi::Int64(), NTi::String()))) + .AddColumn(TColumnSchema() + .Name("MapListOfStructsLegacy") + .Type(createKeyValueStruct(NTi::Int64(), NTi::String()))) + .AddColumn(TColumnSchema() + .Name("MapListOfStructs") + .Type(createKeyValueStruct(NTi::Int64(), embedded))) + 
.AddColumn(TColumnSchema() + .Name("MapOptionalDict") + .Type(NTi::Optional(NTi::Dict(NTi::Int64(), embedded)))) + .AddColumn(TColumnSchema() + .Name("MapDict") + .Type(NTi::Dict(NTi::Int64(), embedded)))); + } + + Y_UNIT_TEST(Oneof) + { + const auto schema = CreateTableSchema<NUnitTesting::TWithOneof>(); + + auto embedded = NTi::Struct({ + {"Oneof", NTi::Optional(NTi::Variant(NTi::Struct({ + {"x", NTi::Int64()}, + {"y", NTi::String()}, + })))}, + }); + + auto createType = [&] (TString oneof2Name) { + return NTi::Optional(NTi::Struct({ + {"field", NTi::Optional(NTi::String())}, + {oneof2Name, NTi::Optional(NTi::Variant(NTi::Struct({ + {"x2", NTi::Int64()}, + {"y2", NTi::String()}, + {"z2", embedded}, + })))}, + {"y1", NTi::Optional(NTi::String())}, + {"z1", NTi::Optional(embedded)}, + {"x1", NTi::Optional(NTi::Int64())}, + })); + }; + + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("DefaultSeparateFields") + .Type(createType("variant_field_name"))) + .AddColumn(TColumnSchema() + .Name("NoDefault") + .Type(createType("Oneof2"))) + .AddColumn(TColumnSchema() + .Name("SerializationProtobuf") + .Type(NTi::Optional(NTi::Struct({ + {"y1", NTi::Optional(NTi::String())}, + {"x1", NTi::Optional(NTi::Int64())}, + {"z1", NTi::Optional(NTi::String())}, + })))) + .AddColumn(TColumnSchema() + .Name("TopLevelOneof") + .Type( + NTi::Optional( + NTi::Variant(NTi::Struct({ + {"MemberOfTopLevelOneof", NTi::Int64()} + })) + ) + )) + ); + } + + Y_UNIT_TEST(Embedded) + { + const auto schema = CreateTableSchema<NUnitTesting::TEmbeddingMessage>(); + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .Strict(false) + .AddColumn(TColumnSchema().Name("embedded2_num").Type(NTi::Optional(NTi::Uint64()))) + .AddColumn(TColumnSchema().Name("embedded2_struct").Type(NTi::Optional(NTi::Struct({ + {"float1", NTi::Optional(NTi::Double())}, + {"string1", NTi::Optional(NTi::String())}, + })))) + 
.AddColumn(TColumnSchema().Name("embedded2_repeated").Type(NTi::List(NTi::String()))) + .AddColumn(TColumnSchema().Name("embedded_num").Type(NTi::Optional(NTi::Uint64()))) + .AddColumn(TColumnSchema().Name("embedded_extra_field").Type(NTi::Optional(NTi::String()))) + .AddColumn(TColumnSchema().Name("variant").Type(NTi::Optional(NTi::Variant(NTi::Struct({ + {"str_variant", NTi::String()}, + {"uint_variant", NTi::Uint64()}, + }))))) + .AddColumn(TColumnSchema().Name("num").Type(NTi::Optional(NTi::Uint64()))) + .AddColumn(TColumnSchema().Name("extra_field").Type(NTi::Optional(NTi::String()))) + ); + } +} + +Y_UNIT_TEST_SUITE(ProtoSchemaTest_Proto3) +{ + Y_UNIT_TEST(TWithOptional) + { + const auto schema = CreateTableSchema<NTestingProto3::TWithOptional>(); + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("x").Type(NTi::Optional(NTi::Int64())) + ) + ); + } + + Y_UNIT_TEST(TWithOptionalMessage) + { + const auto schema = CreateTableSchema<NTestingProto3::TWithOptionalMessage>(); + ASSERT_SERIALIZABLES_EQUAL(schema, TTableSchema() + .AddColumn(TColumnSchema() + .Name("x").Type( + NTi::Optional( + NTi::Struct({{"x", NTi::Optional(NTi::Int64())}}) + ) + ) + ) + ); + } +} diff --git a/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.proto b/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.proto new file mode 100644 index 0000000000..60bad6e650 --- /dev/null +++ b/yt/cpp/mapreduce/interface/protobuf_table_schema_ut.proto @@ -0,0 +1,402 @@ +import "yt/yt_proto/yt/formats/extension.proto"; + +package NYT.NUnitTesting; + +message TIntegral +{ + optional double DoubleField = 1; + optional float FloatField = 2; + optional int32 Int32Field = 3; + optional int64 Int64Field = 4; + optional uint32 Uint32Field = 5; + optional uint64 Uint64Field = 6; + optional sint32 Sint32Field = 7; + optional sint64 Sint64Field = 8; + optional fixed32 Fixed32Field = 9; + optional fixed64 Fixed64Field = 10; + optional sfixed32 Sfixed32Field = 11; + 
optional sfixed64 Sfixed64Field = 12; + optional bool BoolField = 13; + enum TriBool + { + TRI_FALSE = 0; + TRI_TRUE = 1; + TRI_UNDEF = -1; + } + optional TriBool EnumField = 14; +} + +message TRepeated +{ + repeated int32 Int32Field = 1; +} + +message TRepeatedYtMode +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + repeated int32 Int32Field = 1; +} + +message TWithTypeOptions +{ + enum Color + { + WHITE = 0; + BLUE = 1; + RED = -1; + } + + message TEmbedded + { + option (NYT.default_field_flags) = SERIALIZATION_YT; + + optional Color ColorIntField = 1 [(NYT.flags) = ENUM_INT]; + optional Color ColorStringField = 2 [(NYT.flags) = ENUM_STRING]; + optional bytes AnyField = 3 [(NYT.flags) = ANY]; + } + + optional Color ColorIntField = 1 [(NYT.flags) = ENUM_INT]; + optional Color ColorStringField = 2 [(NYT.flags) = ENUM_STRING]; + optional bytes AnyField = 3 [(NYT.flags) = ANY]; + optional bytes OtherColumnsField = 4 [(NYT.flags) = OTHER_COLUMNS]; + optional TEmbedded EmbeddedField = 5 [(NYT.flags) = SERIALIZATION_YT]; + repeated Color RepeatedEnumIntField = 6 [(NYT.flags) = SERIALIZATION_YT, (NYT.flags) = ENUM_INT]; +} + +message TWithTypeOptions_TypeMismatch_EnumInt +{ + optional int64 EnumField = 1 [(NYT.flags) = ENUM_INT]; +} + +message TWithTypeOptions_TypeMismatch_EnumString +{ + optional string EnumField = 1 [(NYT.flags) = ENUM_STRING]; +} + +message TWithTypeOptions_TypeMismatch_Any +{ + optional string AnyField = 1 [(NYT.flags) = ANY]; +} + +message TWithTypeOptions_TypeMismatch_OtherColumns +{ + optional string OtherColumnsField = 1 [(NYT.flags) = OTHER_COLUMNS]; +} + +message TOneOf +{ + oneof Chooser + { + double DoubleField = 1; + int32 Int32Field = 2; + } + optional bool BoolField = 3; +} + +message TWithRequired +{ + required string RequiredField = 1; + optional string NotRequiredField = 2; +}; + +message TAggregated +{ + optional string StringField = 1; + optional bytes BytesField = 2; + optional TIntegral NestedField = 3; + optional 
TRepeated NestedRepeatedField = 4; + optional TOneOf NestedOneOfField = 5; + optional TAggregated NestedRecursiveField = 6; +} + +message TAliased +{ + optional int32 Key = 1 [(NYT.key_column_name) = "key"]; + optional double Subkey = 2 [(NYT.key_column_name) = "subkey"]; + optional TAggregated Data = 3; +} + +//////////////////////////////////////////////////////////////////////////////// + +message TUrlRow +{ + optional string Host = 1 [(NYT.column_name) = "Host"]; + optional string Path = 2 [(NYT.column_name) = "Path"]; + optional sint32 HttpCode = 3 [(NYT.column_name) = "HttpCode"]; +} + +message TRowFieldSerializationOption +{ + optional TUrlRow UrlRow_1 = 1 [(NYT.flags) = SERIALIZATION_YT]; + optional TUrlRow UrlRow_2 = 2; +} + +message TRowMessageSerializationOption +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional TUrlRow UrlRow_1 = 1; + optional TUrlRow UrlRow_2 = 2; +} + +message TRowMixedSerializationOptions +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional TUrlRow UrlRow_1 = 1; + optional TUrlRow UrlRow_2 = 2 [(NYT.flags) = SERIALIZATION_PROTOBUF]; +} + +message TRowSerializedRepeatedFields +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + repeated int64 Ints = 1; + repeated TUrlRow UrlRows = 2; +} + +message TUrlRowWithColumnNames +{ + optional string Host = 1 [(NYT.column_name) = "Host_ColumnName", (NYT.key_column_name) = "Host_KeyColumnName"]; + optional string Path = 2 [(NYT.key_column_name) = "Path_KeyColumnName"]; + optional sint32 HttpCode = 3; +} + +message TRowMixedSerializationOptions_ColumnNames +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional TUrlRowWithColumnNames UrlRow_1 = 1; + optional TUrlRowWithColumnNames UrlRow_2 = 2 [(NYT.flags) = SERIALIZATION_PROTOBUF]; +} + +message TNoOptionInheritance +{ + message TDeepestEmbedded + { + optional int64 x = 1; + } + + message TEmbedded + { + optional TDeepestEmbedded embedded = 1; + } + + message TEmbeddedYt + { + option 
(NYT.default_field_flags) = SERIALIZATION_YT; + + optional TDeepestEmbedded embedded = 1; + } + + message TEmbeddedProtobuf + { + option (NYT.default_field_flags) = SERIALIZATION_PROTOBUF; + + optional TDeepestEmbedded embedded = 1; + } + + optional TEmbeddedYt EmbeddedYt_YtOption = 1 [(NYT.flags) = SERIALIZATION_YT]; + optional TEmbeddedYt EmbeddedYt_ProtobufOption = 2 [(NYT.flags) = SERIALIZATION_PROTOBUF]; + optional TEmbeddedYt EmbeddedYt_NoOption = 3; + optional TEmbeddedProtobuf EmbeddedProtobuf_YtOption = 4 [(NYT.flags) = SERIALIZATION_YT]; + optional TEmbeddedProtobuf EmbeddedProtobuf_ProtobufOption = 5 [(NYT.flags) = SERIALIZATION_PROTOBUF]; + optional TEmbeddedProtobuf EmbeddedProtobuf_NoOption = 6; + optional TEmbedded Embedded_YtOption = 7 [(NYT.flags) = SERIALIZATION_YT]; + optional TEmbedded Embedded_ProtobufOption = 8 [(NYT.flags) = SERIALIZATION_PROTOBUF]; + optional TEmbedded Embedded_NoOption = 9; +} + +message TOptionalList +{ + repeated int64 OptionalListInt64 = 1 [(NYT.flags) = OPTIONAL_LIST, (NYT.flags) = SERIALIZATION_YT]; +} + +message TPacked +{ + repeated int64 PackedListInt64 = 1 [(NYT.flags) = SERIALIZATION_YT, packed=true]; +} + +message TCyclic +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + + message TA + { + option (NYT.default_field_flags) = SERIALIZATION_YT; + repeated TB b = 1; + optional TC c = 2; + } + + message TB + { + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional TD d = 1; + } + + message TC + { + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional TD d = 1; + } + + message TD + { + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional TA a = 1; + } + + message TE + { + optional TD d = 1 [(NYT.flags) = SERIALIZATION_PROTOBUF]; + } + + optional TA a = 1; +} + +message TFieldSortOrder +{ + message TEmbeddedDefault { + optional int64 x = 2; + optional string y = 12; + optional bool z = 1; + } + message TEmbeddedAsInProtoFile { + option (NYT.message_flags) = 
DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE; + optional int64 x = 2; + optional string y = 12; + optional bool z = 1; + } + message TEmbeddedByFieldNumber { + option (NYT.message_flags) = SORT_FIELDS_BY_FIELD_NUMBER; + optional int64 x = 2; + optional string y = 12; + optional bool z = 1; + } + option (NYT.default_field_flags) = SERIALIZATION_YT; + + optional TEmbeddedDefault EmbeddedDefault = 1; + optional TEmbeddedAsInProtoFile EmbeddedAsInProtoFile = 2; + optional TEmbeddedByFieldNumber EmbeddedByFieldNumber = 3; +} + +message TWithMap +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + + message TEmbedded { + optional int64 x = 1; + optional string y = 2; + } + + map<int64, TEmbedded> MapDefault = 1; + map<int64, TEmbedded> MapListOfStructsLegacy = 2 [(NYT.flags) = MAP_AS_LIST_OF_STRUCTS_LEGACY]; + map<int64, TEmbedded> MapListOfStructs = 3 [(NYT.flags) = MAP_AS_LIST_OF_STRUCTS]; + map<int64, TEmbedded> MapOptionalDict = 4 [(NYT.flags) = MAP_AS_OPTIONAL_DICT]; + map<int64, TEmbedded> MapDict = 5 [(NYT.flags) = MAP_AS_DICT]; +} + +message TWithOneof +{ + option (NYT.default_field_flags) = SERIALIZATION_YT; + + message TEmbedded + { + option (NYT.default_field_flags) = SERIALIZATION_YT; + oneof Oneof { + int64 x = 1; + string y = 2; + } + } + + message TDefaultSeparateFields + { + option (NYT.default_oneof_flags) = SEPARATE_FIELDS; + option (NYT.default_field_flags) = SERIALIZATION_YT; + + optional string field = 1; + + oneof Oneof2 + { + option (NYT.variant_field_name) = "variant_field_name"; + option (NYT.oneof_flags) = VARIANT; + string y2 = 4; + TEmbedded z2 = 6; + int64 x2 = 2; + } + + oneof Oneof1 + { + int64 x1 = 10; + string y1 = 3; + TEmbedded z1 = 5; + } + } + + message TNoDefault + { + option (NYT.default_field_flags) = SERIALIZATION_YT; + + optional string field = 1; + + oneof Oneof2 + { + string y2 = 4; + TEmbedded z2 = 6; + int64 x2 = 2; + } + + oneof Oneof1 + { + option (NYT.oneof_flags) = SEPARATE_FIELDS; + int64 x1 = 10; + string y1 = 3; + 
TEmbedded z1 = 5; + } + } + + message TSerializationProtobuf + { + oneof Oneof + { + int64 x1 = 2; + string y1 = 1; + TEmbedded z1 = 3; + } + } + + optional TDefaultSeparateFields DefaultSeparateFields = 1; + optional TNoDefault NoDefault = 2; + optional TSerializationProtobuf SerializationProtobuf = 3; + + oneof TopLevelOneof + { + int64 MemberOfTopLevelOneof = 4; + } +} + +message TEmbeddedStruct { + optional float float1 = 1; + optional string string1 = 2; +} + +message TEmbedded2Message { + option (NYT.default_field_flags) = SERIALIZATION_YT; + optional uint64 embedded2_num = 10; + optional TEmbeddedStruct embedded2_struct = 17; + repeated string embedded2_repeated = 42; +} + +message TEmbedded1Message { + option (NYT.default_field_flags) = SERIALIZATION_YT; + required TEmbedded2Message t2 = 1 [(NYT.flags) = EMBEDDED]; + oneof variant { + string str_variant = 101; + uint64 uint_variant = 102; + } + optional uint64 embedded_num = 10; // make intentional field_num collision! + optional string embedded_extra_field = 11; +} + +message TEmbeddingMessage { + optional bytes other_columns_field = 15 [(NYT.flags) = OTHER_COLUMNS]; + required TEmbedded1Message t1 = 2 [(NYT.flags) = EMBEDDED]; + optional uint64 num = 12; + optional string extra_field = 13; +} diff --git a/yt/cpp/mapreduce/interface/public.h b/yt/cpp/mapreduce/interface/public.h new file mode 100644 index 0000000000..bdeda78795 --- /dev/null +++ b/yt/cpp/mapreduce/interface/public.h @@ -0,0 +1,10 @@ +#pragma once + +#include <memory> + +namespace NYT::NAuth { + +struct IServiceTicketAuthPtrWrapper; +using IServiceTicketAuthPtrWrapperPtr = std::shared_ptr<IServiceTicketAuthPtrWrapper>; + +} // namespace NYT::NAuth diff --git a/yt/cpp/mapreduce/interface/retry_policy.h b/yt/cpp/mapreduce/interface/retry_policy.h new file mode 100644 index 0000000000..c198839079 --- /dev/null +++ b/yt/cpp/mapreduce/interface/retry_policy.h @@ -0,0 +1,47 @@ +#pragma once + +#include <util/datetime/base.h> +#include 
<util/generic/ptr.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// A configuration that controls retries of a single request. +struct TRetryConfig +{ + /// + /// @brief How long retries of a single YT request can go on. + /// + /// If this limit is reached while retry count is not yet exceeded @ref TRequestRetriesTimeout exception is thrown. + TDuration RetriesTimeLimit = TDuration::Max(); +}; + +/// The library uses this class to understand how to retry individual requests. +class IRetryConfigProvider + : public virtual TThrRefBase +{ +public: + /// + /// @brief Gets retry policy for single request. + /// + /// CreateRetryConfig is called before ANY request. + /// Returned config controls retries of this request. + /// + /// Must be thread safe since it can be used from different threads + /// to perform internal library requests (e.g. pings). + /// + /// Some methods (e.g. IClient::Map) involve multiple requests to YT and therefore + /// this method will be called several times during execution of single method. + /// + /// If user needs to limit overall retries inside long operation they might create + /// retry policy that knows about overall deadline + /// @ref NYT::TRetryConfig::RetriesTimeLimit taking into account that overall deadline. + /// (E.g. when deadline reached it returns zero limit for retries). 
+ virtual TRetryConfig CreateRetryConfig() = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT + diff --git a/yt/cpp/mapreduce/interface/serialize.cpp b/yt/cpp/mapreduce/interface/serialize.cpp new file mode 100644 index 0000000000..ae05d9f50d --- /dev/null +++ b/yt/cpp/mapreduce/interface/serialize.cpp @@ -0,0 +1,553 @@ +#include "serialize.h" + +#include "common.h" +#include "fluent.h" + +#include <library/cpp/yson/parser.h> +#include <library/cpp/yson/node/node_io.h> +#include <library/cpp/yson/node/serialize.h> + +#include <library/cpp/type_info/type_io.h> + +#include <util/generic/string.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +// const auto& nodeMap = node.AsMap(); +#define DESERIALIZE_ITEM(NAME, MEMBER) \ + if (const auto* item = nodeMap.FindPtr(NAME)) { \ + Deserialize(MEMBER, *item); \ + } + +// const auto& attributesMap = node.GetAttributes().AsMap(); +#define DESERIALIZE_ATTR(NAME, MEMBER) \ + if (const auto* attr = attributesMap.FindPtr(NAME)) { \ + Deserialize(MEMBER, *attr); \ + } + +//////////////////////////////////////////////////////////////////////////////// + +void Serialize(const TSortColumn& sortColumn, NYson::IYsonConsumer* consumer) +{ + if (sortColumn.SortOrder() == ESortOrder::SO_ASCENDING) { + Serialize(sortColumn.Name(), consumer); + } else { + BuildYsonFluently(consumer).BeginMap() + .Item("name").Value(sortColumn.Name()) + .Item("sort_order").Value(ToString(sortColumn.SortOrder())) + .EndMap(); + } +} + +void Deserialize(TSortColumn& sortColumn, const TNode& node) +{ + if (node.IsString()) { + sortColumn = TSortColumn(node.AsString()); + } else if (node.IsMap()) { + const auto& name = node["name"].AsString(); + const auto& sortOrderString = node["sort_order"].AsString(); + sortColumn = TSortColumn(name, ::FromString<ESortOrder>(sortOrderString)); + } else { + ythrow yexception() << "Expected sort column to 
be string or map, got " << node.GetType(); + } +} + +template <class T, class TDerived> +void SerializeOneOrMany(const TOneOrMany<T, TDerived>& oneOrMany, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).List(oneOrMany.Parts_); +} + +template <class T, class TDerived> +void DeserializeOneOrMany(TOneOrMany<T, TDerived>& oneOrMany, const TNode& node) +{ + Deserialize(oneOrMany.Parts_, node); +} + +void Serialize(const TKey& key, NYson::IYsonConsumer* consumer) +{ + SerializeOneOrMany(key, consumer); +} + +void Deserialize(TKey& key, const TNode& node) +{ + DeserializeOneOrMany(key, node); +} + +void Serialize(const TSortColumns& sortColumns, NYson::IYsonConsumer* consumer) +{ + SerializeOneOrMany(sortColumns, consumer); +} + +void Deserialize(TSortColumns& sortColumns, const TNode& node) +{ + DeserializeOneOrMany(sortColumns, node); +} + +void Serialize(const TColumnNames& columnNames, NYson::IYsonConsumer* consumer) +{ + SerializeOneOrMany(columnNames, consumer); +} + +void Deserialize(TColumnNames& columnNames, const TNode& node) +{ + DeserializeOneOrMany(columnNames, node); +} + +//////////////////////////////////////////////////////////////////////////////// + +void Deserialize(EValueType& valueType, const TNode& node) +{ + const auto& nodeStr = node.AsString(); + static const THashMap<TString, EValueType> str2ValueType = { + {"int8", VT_INT8}, + {"int16", VT_INT16}, + {"int32", VT_INT32}, + {"int64", VT_INT64}, + + {"uint8", VT_UINT8}, + {"uint16", VT_UINT16}, + {"uint32", VT_UINT32}, + {"uint64", VT_UINT64}, + + {"boolean", VT_BOOLEAN}, + {"double", VT_DOUBLE}, + + {"string", VT_STRING}, + {"utf8", VT_UTF8}, + + {"any", VT_ANY}, + + {"null", VT_NULL}, + {"void", VT_VOID}, + + {"date", VT_DATE}, + {"datetime", VT_DATETIME}, + {"timestamp", VT_TIMESTAMP}, + {"interval", VT_INTERVAL}, + {"float", VT_FLOAT}, + {"json", VT_JSON}, + }; + + auto it = str2ValueType.find(nodeStr); + if (it == str2ValueType.end()) { + ythrow yexception() << "Invalid value 
type '" << nodeStr << "'"; + } + + valueType = it->second; +} + +void Deserialize(ESortOrder& sortOrder, const TNode& node) +{ + sortOrder = FromString<ESortOrder>(node.AsString()); +} + +void Deserialize(EOptimizeForAttr& optimizeFor, const TNode& node) +{ + optimizeFor = FromString<EOptimizeForAttr>(node.AsString()); +} + +void Deserialize(EErasureCodecAttr& erasureCodec, const TNode& node) +{ + erasureCodec = FromString<EErasureCodecAttr>(node.AsString()); +} + +void Deserialize(ESchemaModificationAttr& schemaModification, const TNode& node) +{ + schemaModification = FromString<ESchemaModificationAttr>(node.AsString()); +} + +void Serialize(const TColumnSchema& columnSchema, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).BeginMap() + .Item("name").Value(columnSchema.Name()) + .DoIf(!columnSchema.RawTypeV3().Defined(), + [&] (TFluentMap fluent) { + fluent.Item("type").Value(NDetail::ToString(columnSchema.Type())); + fluent.Item("required").Value(columnSchema.Required()); + if (columnSchema.Type() == VT_ANY + && *columnSchema.TypeV3() != *NTi::Optional(NTi::Yson())) + { + // A lot of users canonize the serialized schema. + // To be backward compatible we only set type_v3 for new types. + fluent.Item("type_v3").Value(columnSchema.TypeV3()); + } + } + ) + .DoIf(columnSchema.RawTypeV3().Defined(), [&] (TFluentMap fluent) { + const auto& rawTypeV3 = *columnSchema.RawTypeV3(); + fluent.Item("type_v3").Value(rawTypeV3); + + // We are going to set the old fields `type` and `required` to be compatible + // with old clusters that don't support type_v3 yet. 
+ + // if type is simple return its name otherwise return empty optional + auto isRequired = [](TStringBuf simpleType) { + return simpleType != "null" && simpleType != "void"; + }; + auto getSimple = [] (const TNode& typeV3) -> TMaybe<TString> { + static const THashMap<TString,TString> typeV3ToOld = { + {"bool", "boolean"}, + {"yson", "any"}, + }; + TMaybe<TString> result; + if (typeV3.IsString()) { + result = typeV3.AsString(); + } else if (typeV3.IsMap() && typeV3.Size() == 1) { + Y_VERIFY(typeV3["type_name"].IsString(), "invalid type is passed"); + result = typeV3["type_name"].AsString(); + } + if (result) { + auto it = typeV3ToOld.find(*result); + if (it != typeV3ToOld.end()) { + result = it->second; + } + } + return result; + }; + auto simplify = [&](const TNode& typeV3) -> TMaybe<std::pair<TString, bool>> { + auto simple = getSimple(typeV3); + if (simple) { + return std::make_pair(*simple, isRequired(*simple)); + } + if (typeV3.IsMap() && typeV3["type_name"] == "optional") { + auto simpleItem = getSimple(typeV3["item"]); + if (simpleItem && isRequired(*simpleItem)) { + return std::make_pair(*simpleItem, false); + } + } + return {}; + }; + + auto simplified = simplify(rawTypeV3); + + if (simplified) { + const auto& [simpleType, required] = *simplified; + fluent + .Item("type").Value(simpleType) + .Item("required").Value(required); + return; + } + }) + .DoIf(columnSchema.SortOrder().Defined(), [&] (TFluentMap fluent) { + fluent.Item("sort_order").Value(ToString(*columnSchema.SortOrder())); + }) + .DoIf(columnSchema.Lock().Defined(), [&] (TFluentMap fluent) { + fluent.Item("lock").Value(*columnSchema.Lock()); + }) + .DoIf(columnSchema.Expression().Defined(), [&] (TFluentMap fluent) { + fluent.Item("expression").Value(*columnSchema.Expression()); + }) + .DoIf(columnSchema.Aggregate().Defined(), [&] (TFluentMap fluent) { + fluent.Item("aggregate").Value(*columnSchema.Aggregate()); + }) + .DoIf(columnSchema.Group().Defined(), [&] (TFluentMap fluent) { + 
fluent.Item("group").Value(*columnSchema.Group()); + }) + .EndMap(); +} + +void Deserialize(TColumnSchema& columnSchema, const TNode& node) +{ + const auto& nodeMap = node.AsMap(); + DESERIALIZE_ITEM("name", columnSchema.Name_); + DESERIALIZE_ITEM("type_v3", columnSchema.RawTypeV3_); + DESERIALIZE_ITEM("sort_order", columnSchema.SortOrder_); + DESERIALIZE_ITEM("lock", columnSchema.Lock_); + DESERIALIZE_ITEM("expression", columnSchema.Expression_); + DESERIALIZE_ITEM("aggregate", columnSchema.Aggregate_); + DESERIALIZE_ITEM("group", columnSchema.Group_); + + if (nodeMap.contains("type_v3")) { + NTi::TTypePtr type; + DESERIALIZE_ITEM("type_v3", type); + columnSchema.Type(type); + } else { + EValueType oldType = VT_INT64; + bool required = false; + DESERIALIZE_ITEM("type", oldType); + DESERIALIZE_ITEM("required", required); + columnSchema.Type(ToTypeV3(oldType, required)); + } +} + +void Serialize(const TTableSchema& tableSchema, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).BeginAttributes() + .Item("strict").Value(tableSchema.Strict()) + .Item("unique_keys").Value(tableSchema.UniqueKeys()) + .EndAttributes() + .List(tableSchema.Columns()); +} + +void Deserialize(TTableSchema& tableSchema, const TNode& node) +{ + const auto& attributesMap = node.GetAttributes().AsMap(); + DESERIALIZE_ATTR("strict", tableSchema.Strict_); + DESERIALIZE_ATTR("unique_keys", tableSchema.UniqueKeys_); + Deserialize(tableSchema.Columns_, node); +} + +//////////////////////////////////////////////////////////////////////////////// + +void Serialize(const TKeyBound& keyBound, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).BeginList() + .Item().Value(ToString(keyBound.Relation())) + .Item().Value(keyBound.Key()) + .EndList(); +} + +void Deserialize(TKeyBound& keyBound, const TNode& node) +{ + const auto& nodeList = node.AsList(); + Y_ENSURE(nodeList.size() == 2); + + const auto& relationNode = nodeList[0]; + 
keyBound.Relation(::FromString<ERelation>(relationNode.AsString())); + + const auto& keyNode = nodeList[1]; + TKey key; + Deserialize(key, keyNode); + keyBound.Key(std::move(key)); +} + +//////////////////////////////////////////////////////////////////////////////// + +void Serialize(const TReadLimit& readLimit, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).BeginMap() + .DoIf(readLimit.KeyBound_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("key_bound").Value(*readLimit.KeyBound_); + }) + .DoIf(readLimit.Key_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("key").Value(*readLimit.Key_); + }) + .DoIf(readLimit.RowIndex_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("row_index").Value(*readLimit.RowIndex_); + }) + .DoIf(readLimit.Offset_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("offset").Value(*readLimit.Offset_); + }) + .DoIf(readLimit.TabletIndex_.Defined(), [&] (TFluentMap fluent) { + fluent.Item("tablet_index").Value(*readLimit.TabletIndex_); + }) + .EndMap(); +} + +void Deserialize(TReadLimit& readLimit, const TNode& node) +{ + const auto& nodeMap = node.AsMap(); + DESERIALIZE_ITEM("key_bound", readLimit.KeyBound_); + DESERIALIZE_ITEM("key", readLimit.Key_); + DESERIALIZE_ITEM("row_index", readLimit.RowIndex_); + DESERIALIZE_ITEM("offset", readLimit.Offset_); + DESERIALIZE_ITEM("tablet_index", readLimit.TabletIndex_); +} + +void Serialize(const TReadRange& readRange, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).BeginMap() + .DoIf(!IsTrivial(readRange.LowerLimit_), [&] (TFluentMap fluent) { + fluent.Item("lower_limit").Value(readRange.LowerLimit_); + }) + .DoIf(!IsTrivial(readRange.UpperLimit_), [&] (TFluentMap fluent) { + fluent.Item("upper_limit").Value(readRange.UpperLimit_); + }) + .DoIf(!IsTrivial(readRange.Exact_), [&] (TFluentMap fluent) { + fluent.Item("exact").Value(readRange.Exact_); + }) + .EndMap(); +} + +void Deserialize(TReadRange& readRange, const TNode& node) +{ + const auto& nodeMap = 
node.AsMap(); + DESERIALIZE_ITEM("lower_limit", readRange.LowerLimit_); + DESERIALIZE_ITEM("upper_limit", readRange.UpperLimit_); + DESERIALIZE_ITEM("exact", readRange.Exact_); +} + +void Serialize(const THashMap<TString, TString>& renameColumns, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer) + .DoMapFor(renameColumns, [] (TFluentMap fluent, const auto& item) { + fluent.Item(item.first).Value(item.second); + }); +} + +void Serialize(const TRichYPath& path, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).BeginAttributes() + .DoIf(path.GetRanges().Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("ranges").List(*path.GetRanges()); + }) + .DoIf(path.Columns_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("columns").Value(*path.Columns_); + }) + .DoIf(path.Append_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("append").Value(*path.Append_); + }) + .DoIf(path.PartiallySorted_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("partially_sorted").Value(*path.PartiallySorted_); + }) + .DoIf(!path.SortedBy_.Parts_.empty(), [&] (TFluentAttributes fluent) { + fluent.Item("sorted_by").Value(path.SortedBy_); + }) + .DoIf(path.Teleport_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("teleport").Value(*path.Teleport_); + }) + .DoIf(path.Primary_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("primary").Value(*path.Primary_); + }) + .DoIf(path.Foreign_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("foreign").Value(*path.Foreign_); + }) + .DoIf(path.RowCountLimit_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("row_count_limit").Value(*path.RowCountLimit_); + }) + .DoIf(path.FileName_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("file_name").Value(*path.FileName_); + }) + .DoIf(path.OriginalPath_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("original_path").Value(*path.OriginalPath_); + }) + .DoIf(path.Executable_.Defined(), [&] 
(TFluentAttributes fluent) { + fluent.Item("executable").Value(*path.Executable_); + }) + .DoIf(path.Format_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("format").Value(*path.Format_); + }) + .DoIf(path.Schema_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("schema").Value(*path.Schema_); + }) + .DoIf(path.Timestamp_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("timestamp").Value(*path.Timestamp_); + }) + .DoIf(path.CompressionCodec_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("compression_codec").Value(*path.CompressionCodec_); + }) + .DoIf(path.ErasureCodec_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("erasure_codec").Value(ToString(*path.ErasureCodec_)); + }) + .DoIf(path.SchemaModification_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("schema_modification").Value(ToString(*path.SchemaModification_)); + }) + .DoIf(path.OptimizeFor_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("optimize_for").Value(ToString(*path.OptimizeFor_)); + }) + .DoIf(path.TransactionId_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("transaction_id").Value(GetGuidAsString(*path.TransactionId_)); + }) + .DoIf(path.RenameColumns_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("rename_columns").Value(*path.RenameColumns_); + }) + .DoIf(path.BypassArtifactCache_.Defined(), [&] (TFluentAttributes fluent) { + fluent.Item("bypass_artifact_cache").Value(*path.BypassArtifactCache_); + }) + .EndAttributes() + .Value(path.Path_); +} + +void Deserialize(TRichYPath& path, const TNode& node) +{ + path = {}; + + const auto& attributesMap = node.GetAttributes().AsMap(); + DESERIALIZE_ATTR("ranges", path.MutableRanges()); + DESERIALIZE_ATTR("columns", path.Columns_); + DESERIALIZE_ATTR("append", path.Append_); + DESERIALIZE_ATTR("partially_sorted", path.PartiallySorted_); + DESERIALIZE_ATTR("sorted_by", path.SortedBy_); + DESERIALIZE_ATTR("teleport", path.Teleport_); + 
DESERIALIZE_ATTR("primary", path.Primary_); + DESERIALIZE_ATTR("foreign", path.Foreign_); + DESERIALIZE_ATTR("row_count_limit", path.RowCountLimit_); + DESERIALIZE_ATTR("file_name", path.FileName_); + DESERIALIZE_ATTR("original_path", path.OriginalPath_); + DESERIALIZE_ATTR("executable", path.Executable_); + DESERIALIZE_ATTR("format", path.Format_); + DESERIALIZE_ATTR("schema", path.Schema_); + DESERIALIZE_ATTR("timestamp", path.Timestamp_); + DESERIALIZE_ATTR("compression_codec", path.CompressionCodec_); + DESERIALIZE_ATTR("erasure_codec", path.ErasureCodec_); + DESERIALIZE_ATTR("schema_modification", path.SchemaModification_); + DESERIALIZE_ATTR("optimize_for", path.OptimizeFor_); + DESERIALIZE_ATTR("transaction_id", path.TransactionId_); + DESERIALIZE_ATTR("rename_columns", path.RenameColumns_); + DESERIALIZE_ATTR("bypass_artifact_cache", path.BypassArtifactCache_); + Deserialize(path.Path_, node); +} + +void Serialize(const TAttributeFilter& filter, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).List(filter.Attributes_); +} + +void Deserialize(TTableColumnarStatistics& statistics, const TNode& node) +{ + const auto& nodeMap = node.AsMap(); + DESERIALIZE_ITEM("column_data_weights", statistics.ColumnDataWeight); + DESERIALIZE_ITEM("legacy_chunks_data_weight", statistics.LegacyChunksDataWeight); + DESERIALIZE_ITEM("timestamp_total_weight", statistics.TimestampTotalWeight); +} + +void Deserialize(TMultiTablePartition::TStatistics& statistics, const TNode& node) +{ + const auto& nodeMap = node.AsMap(); + DESERIALIZE_ITEM("chunk_count", statistics.ChunkCount); + DESERIALIZE_ITEM("data_weight", statistics.DataWeight); + DESERIALIZE_ITEM("row_count", statistics.RowCount); +} + +void Deserialize(TMultiTablePartition& partition, const TNode& node) +{ + const auto& nodeMap = node.AsMap(); + DESERIALIZE_ITEM("table_ranges", partition.TableRanges); + DESERIALIZE_ITEM("aggregate_statistics", partition.AggregateStatistics); +} + +void 
Deserialize(TMultiTablePartitions& partitions, const TNode& node) +{ + const auto& nodeMap = node.AsMap(); + DESERIALIZE_ITEM("partitions", partitions.Partitions); +} + +void Serialize(const TGUID& value, NYson::IYsonConsumer* consumer) +{ + BuildYsonFluently(consumer).Value(GetGuidAsString(value)); +} + +void Deserialize(TGUID& value, const TNode& node) +{ + value = GetGuid(node.AsString()); +} + +void Deserialize(TTabletInfo& value, const TNode& node) +{ + auto nodeMap = node.AsMap(); + DESERIALIZE_ITEM("total_row_count", value.TotalRowCount) + DESERIALIZE_ITEM("trimmed_row_count", value.TrimmedRowCount) + DESERIALIZE_ITEM("barrier_timestamp", value.BarrierTimestamp) +} + +void Serialize(const NTi::TTypePtr& type, NYson::IYsonConsumer* consumer) +{ + auto yson = NTi::NIo::SerializeYson(type.Get()); + ::NYson::ParseYsonStringBuffer(yson, consumer); +} + +void Deserialize(NTi::TTypePtr& type, const TNode& node) +{ + auto yson = NodeToYsonString(node, NYson::EYsonFormat::Binary); + type = NTi::NIo::DeserializeYson(*NTi::HeapFactory(), yson); +} + +#undef DESERIALIZE_ITEM +#undef DESERIALIZE_ATTR + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/serialize.h b/yt/cpp/mapreduce/interface/serialize.h new file mode 100644 index 0000000000..223dd446ba --- /dev/null +++ b/yt/cpp/mapreduce/interface/serialize.h @@ -0,0 +1,90 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/serialize.h +/// +/// Header containing declaration of functions for serializing to/from YSON. 
+ +#include "common.h" + +#include <library/cpp/type_info/fwd.h> + +namespace NYT::NYson { +struct IYsonConsumer; +} // namespace NYT::NYson + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +template <class T> +void Deserialize(TMaybe<T>& value, const TNode& node) +{ + value.ConstructInPlace(); + Deserialize(value.GetRef(), node); +} + +template <class T> +void Deserialize(TVector<T>& value, const TNode& node) +{ + for (const auto& element : node.AsList()) { + value.emplace_back(); + Deserialize(value.back(), element); + } +} + +template <class T> +void Deserialize(THashMap<TString, T>& value, const TNode& node) +{ + for (const auto& item : node.AsMap()) { + Deserialize(value[item.first], item.second); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +void Serialize(const TKey& key, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TKey& key, const TNode& node); + +void Serialize(const TSortColumns& sortColumns, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TSortColumns& sortColumns, const TNode& node); + +void Serialize(const TColumnNames& columnNames, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TColumnNames& columnNames, const TNode& node); + +void Serialize(const TSortColumn& sortColumn, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TSortColumn& sortColumn, const TNode& node); + +void Serialize(const TKeyBound& keyBound, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TKeyBound& keyBound, const TNode& node); + +void Serialize(const TReadLimit& readLimit, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TReadLimit& readLimit, const TNode& node); + +void Serialize(const TReadRange& readRange, NYT::NYson::IYsonConsumer* consumer); + +void Serialize(const TRichYPath& path, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TRichYPath& path, const TNode& node); + +void Serialize(const TAttributeFilter& filter, 
NYT::NYson::IYsonConsumer* consumer); + +void Serialize(const TColumnSchema& columnSchema, NYT::NYson::IYsonConsumer* consumer); +void Serialize(const TTableSchema& tableSchema, NYT::NYson::IYsonConsumer* consumer); + +void Deserialize(EValueType& valueType, const TNode& node); +void Deserialize(TTableSchema& tableSchema, const TNode& node); +void Deserialize(TColumnSchema& columnSchema, const TNode& node); +void Deserialize(TTableColumnarStatistics& statistics, const TNode& node); +void Deserialize(TMultiTablePartition& partition, const TNode& node); +void Deserialize(TMultiTablePartitions& partitions, const TNode& node); +void Deserialize(TTabletInfo& tabletInfos, const TNode& node); + +void Serialize(const TGUID& path, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(TGUID& value, const TNode& node); + +void Serialize(const NTi::TTypePtr& type, NYT::NYson::IYsonConsumer* consumer); +void Deserialize(NTi::TTypePtr& type, const TNode& node); + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/serialize_ut.cpp b/yt/cpp/mapreduce/interface/serialize_ut.cpp new file mode 100644 index 0000000000..59d4501ee8 --- /dev/null +++ b/yt/cpp/mapreduce/interface/serialize_ut.cpp @@ -0,0 +1,49 @@ +#include <yt/cpp/mapreduce/interface/serialize.h> +#include <yt/cpp/mapreduce/interface/common.h> + +#include <library/cpp/yson/node/node_builder.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/serialized_enum.h> + +using namespace NYT; + +Y_UNIT_TEST_SUITE(Serialization) +{ + Y_UNIT_TEST(TableSchema) + { + auto schema = TTableSchema() + .AddColumn(TColumnSchema().Name("a").Type(EValueType::VT_STRING).SortOrder(SO_ASCENDING)) + .AddColumn(TColumnSchema().Name("b").Type(EValueType::VT_UINT64)) + .AddColumn(TColumnSchema().Name("c").Type(EValueType::VT_INT64, true)); + + auto schemaNode = schema.ToNode(); + UNIT_ASSERT(schemaNode.IsList()); + 
UNIT_ASSERT_VALUES_EQUAL(schemaNode.Size(), 3); + + + UNIT_ASSERT_VALUES_EQUAL(schemaNode[0]["name"], "a"); + UNIT_ASSERT_VALUES_EQUAL(schemaNode[0]["type"], "string"); + UNIT_ASSERT_VALUES_EQUAL(schemaNode[0]["required"], false); + UNIT_ASSERT_VALUES_EQUAL(schemaNode[0]["sort_order"], "ascending"); + + UNIT_ASSERT_VALUES_EQUAL(schemaNode[1]["name"], "b"); + UNIT_ASSERT_VALUES_EQUAL(schemaNode[1]["type"], "uint64"); + UNIT_ASSERT_VALUES_EQUAL(schemaNode[1]["required"], false); + + UNIT_ASSERT_VALUES_EQUAL(schemaNode[2]["name"], "c"); + UNIT_ASSERT_VALUES_EQUAL(schemaNode[2]["type"], "int64"); + UNIT_ASSERT_VALUES_EQUAL(schemaNode[2]["required"], true); + } + + Y_UNIT_TEST(ValueTypeSerialization) + { + for (const auto value : GetEnumAllValues<EValueType>()) { + TNode serialized = NYT::NDetail::ToString(value); + EValueType deserialized; + Deserialize(deserialized, serialized); + UNIT_ASSERT_VALUES_EQUAL(value, deserialized); + } + } +} diff --git a/yt/cpp/mapreduce/interface/skiff_row.cpp b/yt/cpp/mapreduce/interface/skiff_row.cpp new file mode 100644 index 0000000000..7838bdaee9 --- /dev/null +++ b/yt/cpp/mapreduce/interface/skiff_row.cpp @@ -0,0 +1 @@ +#include "skiff_row.h" diff --git a/yt/cpp/mapreduce/interface/skiff_row.h b/yt/cpp/mapreduce/interface/skiff_row.h new file mode 100644 index 0000000000..5dd335cb65 --- /dev/null +++ b/yt/cpp/mapreduce/interface/skiff_row.h @@ -0,0 +1,127 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/skiff_row.h +/// Header containing interfaces that you need to define for using TSkiffRowTableReader +/// What you need to do for your struct type TMyType: +/// 1. Write `true` specialization TIsSkiffRow<TMyType>; +/// 2. Write specialization GetSkiffSchema<TMyType>(); +/// 3. Write your own parser derived from ISkiffRowParser and write specialization GetSkiffParser<TMyType>() which returns this parser. 
+ +#include "fwd.h" + +#include <yt/cpp/mapreduce/skiff/skiff_schema.h> + +#include <yt/cpp/mapreduce/interface/format.h> + +#include <library/cpp/skiff/skiff.h> + +#include <util/generic/maybe.h> + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +//! Need to write `true_type` specialization for your row type `T`. +/// And implement two functions: `GetSkiffSchema` and `CreateSkiffParser`. +/// +/// Example: +/// +/// template <> +/// struct TIsSkiffRow<T> +/// : std::true_type +/// { }; +/// +template<class T> +struct TIsSkiffRow + : std::false_type +{ }; + +//////////////////////////////////////////////////////////////////////////////// + +//! Return skiff schema for row type `T`. +/// Need to write its specialization. +template <typename T> +NSkiff::TSkiffSchemaPtr GetSkiffSchema(const TMaybe<TSkiffRowHints>& /*hints*/) +{ + static_assert(TDependentFalse<T>, "Unimplemented `GetSkiffSchema` method"); +} + +//////////////////////////////////////////////////////////////////////////////// + +//! Allows parsing rows as user structs from a stream (TCheckedInDebugSkiffParser). +/// You need to write a derived class for your own row type. +/// +/// Example: +/// +/// class TMySkiffRowParser : public ISkiffRowParser +/// { +/// public: +/// TMySkiffRowParser(TMySkiffRow* row) +/// : Row_(row) +/// {} +/// +/// void Parse(NSkiff::TCheckedInDebugSkiffParser* parser) +/// { +/// Row_->SomeInt64Field = parser->ParseInt64(); +/// } +/// +/// private: +/// TMySkiffRow* Row_; +/// }; +/// +class ISkiffRowParser + : public TThrRefBase +{ +public: + //! Read one row from parser + virtual void Parse(NSkiff::TCheckedInDebugSkiffParser* /*parser*/) = 0; +}; + +//! Creates a parser for row type `T`. 
+template <typename T> +ISkiffRowParserPtr CreateSkiffParser(T* /*row*/, const TMaybe<TSkiffRowHints>& /*hints*/) +{ + static_assert(TDependentFalse<T>, "Unimplemented `CreateSkiffParser` function"); +} + +//////////////////////////////////////////////////////////////////////////////// + +//! Allow to skip row content without getting row. +/// By default row will be parsed using your parser derived from ISkiffRowParser. +/// If you want, you can write more optimal skipper, but it isn't required. +class ISkiffRowSkipper + : public TThrRefBase +{ +public: + virtual void SkipRow(NSkiff::TCheckedInDebugSkiffParser* /*parser*/) = 0; +}; + +//! Default ISkiffRowSkipper implementation. +template <typename T> +class TSkiffRowSkipper : public ISkiffRowSkipper { +public: + explicit TSkiffRowSkipper(const TMaybe<TSkiffRowHints>& hints) + : Parser_(CreateSkiffParser<T>(&Row_, hints)) + { } + + void SkipRow(NSkiff::TCheckedInDebugSkiffParser* parser) { + Parser_->Parse(parser); + } + +private: + T Row_; + ISkiffRowParserPtr Parser_; +}; + +//! Creates a skipper for row type 'T'. +/// You don't need to write its specialization. 
+template <typename T> +ISkiffRowSkipperPtr CreateSkiffSkipper(const TMaybe<TSkiffRowHints>& hints) +{ + return ::MakeIntrusive<TSkiffRowSkipper<T>>(hints); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/tvm.cpp b/yt/cpp/mapreduce/interface/tvm.cpp new file mode 100644 index 0000000000..bfa3f0304e --- /dev/null +++ b/yt/cpp/mapreduce/interface/tvm.cpp @@ -0,0 +1 @@ +#include "tvm.h" diff --git a/yt/cpp/mapreduce/interface/tvm.h b/yt/cpp/mapreduce/interface/tvm.h new file mode 100644 index 0000000000..d8d16d841b --- /dev/null +++ b/yt/cpp/mapreduce/interface/tvm.h @@ -0,0 +1,35 @@ +#pragma once + +#include <yt/yt/library/tvm/tvm_base.h> + +#include <library/cpp/yt/memory/intrusive_ptr.h> + +namespace NYT::NAuth { + +//////////////////////////////////////////////////////////////////////////////// + +/// This wrapper is required because NYT::NAuth::IServiceTicketAuthPtr is NYT::TIntrusivePtr, +/// and, if we used this pointer in interfaces of `mapreduce/yt` client, a lot of users of this library +/// could get unexpected build errors that `TIntrusivePtr` is ambiguous +/// (from `::` namespace and from `::NYT::` namespace). +/// So we use this wrapper in our interfaces to avoid such problems for users. +struct IServiceTicketAuthPtrWrapper +{ + /// + /// Construct wrapper from NYT::TIntrusivePtr + /// + /// This constructor is implicit so users can transparently pass NYT::TIntrusivePtr to the functions of + /// mapreduce/yt client. 
+ template <class T, class = typename std::enable_if_t<std::is_convertible_v<T*, IServiceTicketAuth*>>> + IServiceTicketAuthPtrWrapper(const TIntrusivePtr<T> ptr) + : Ptr(ptr) + { + } + + /// Wrapped pointer + NYT::TIntrusivePtr<IServiceTicketAuth> Ptr; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT::NAuth diff --git a/yt/cpp/mapreduce/interface/ut/ya.make b/yt/cpp/mapreduce/interface/ut/ya.make new file mode 100644 index 0000000000..0219e6430c --- /dev/null +++ b/yt/cpp/mapreduce/interface/ut/ya.make @@ -0,0 +1,25 @@ +UNITTEST_FOR(yt/cpp/mapreduce/interface) + +SRCS( + common_ut.cpp + config_ut.cpp + error_ut.cpp + format_ut.cpp + job_counters_ut.cpp + job_statistics_ut.cpp + operation_ut.cpp + proto3_ut.proto + protobuf_table_schema_ut.cpp + protobuf_file_options_ut.cpp + protobuf_table_schema_ut.proto + protobuf_file_options_ut.proto + serialize_ut.cpp +) + +PEERDIR( + contrib/libs/protobuf + library/cpp/testing/unittest + yt/yt_proto/yt/formats +) + +END() diff --git a/yt/cpp/mapreduce/interface/wait_proxy.h b/yt/cpp/mapreduce/interface/wait_proxy.h new file mode 100644 index 0000000000..f7d8e0638e --- /dev/null +++ b/yt/cpp/mapreduce/interface/wait_proxy.h @@ -0,0 +1,54 @@ +#pragma once + +/// +/// @file yt/cpp/mapreduce/interface/wait_proxy.h +/// +/// Header containing interface to enable customizable waiting. + +#include <yt/cpp/mapreduce/interface/common.h> + +#include <util/datetime/base.h> + +namespace NThreading { +template <typename T> +class TFuture; +} + +class TSystemEvent; +class TCondVar; +class TMutex; + +namespace NYT { + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// @brief Interface to facilitate customizable waiting. +/// +/// All the waiting functions in the library are obliged to use the methods of a wait proxy instead of direct function calls. 
+class IWaitProxy + : public TThrRefBase +{ +public: + virtual ~IWaitProxy() = default; + + /// + /// @brief Wait for the future setting with timeout. + virtual bool WaitFuture(const ::NThreading::TFuture<void>& future, TDuration timeout) = 0; + + /// + /// @brief Wait for a system event with timeout. + virtual bool WaitEvent(TSystemEvent& event, TDuration timeout) = 0; + + /// + /// @brief Wait for the notification on the condition variable with timeout. + virtual bool WaitCondVar(TCondVar& condVar, TMutex& mutex, TDuration timeout) = 0; + + /// + /// @brief Sleep in the current thread for (approximately) specified amount of time. + virtual void Sleep(TDuration timeout) = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace NYT diff --git a/yt/cpp/mapreduce/interface/ya.make b/yt/cpp/mapreduce/interface/ya.make new file mode 100644 index 0000000000..0e94f14633 --- /dev/null +++ b/yt/cpp/mapreduce/interface/ya.make @@ -0,0 +1,46 @@ +LIBRARY() + +INCLUDE(${ARCADIA_ROOT}/yt/ya_cpp.make.inc) + +SRCS( + batch_request.cpp + client.cpp + client_method_options.cpp + common.cpp + config.cpp + cypress.cpp + errors.cpp + format.cpp + job_counters.cpp + job_statistics.cpp + io.cpp + operation.cpp + protobuf_format.cpp + serialize.cpp + skiff_row.cpp + tvm.cpp +) + +PEERDIR( + contrib/libs/protobuf + library/cpp/type_info + library/cpp/threading/future + library/cpp/yson/node + yt/cpp/mapreduce/interface/logging + yt/yt_proto/yt/formats + yt/yt/library/tvm +) + +GENERATE_ENUM_SERIALIZATION(client_method_options.h) +GENERATE_ENUM_SERIALIZATION(client.h) +GENERATE_ENUM_SERIALIZATION(common.h) +GENERATE_ENUM_SERIALIZATION(config.h) +GENERATE_ENUM_SERIALIZATION(cypress.h) +GENERATE_ENUM_SERIALIZATION(job_counters.h) +GENERATE_ENUM_SERIALIZATION(job_statistics.h) +GENERATE_ENUM_SERIALIZATION(operation.h) +GENERATE_ENUM_SERIALIZATION(protobuf_format.h) + +END() + +RECURSE_FOR_TESTS(ut) |