diff options
author | alexv-smirnov <alex@ydb.tech> | 2023-09-19 12:14:42 +0300 |
---|---|---|
committer | alexv-smirnov <alex@ydb.tech> | 2023-09-19 12:43:02 +0300 |
commit | 4662db373a1785a836134a4b2342e8471342bdda (patch) | |
tree | 2df6840dc71039dbcb4192a4b6b2edbf5f26248d /library/cpp/protobuf/yql | |
parent | 0364421e62126bcf971c82084d3c923be85c77c7 (diff) | |
download | ydb-4662db373a1785a836134a4b2342e8471342bdda.tar.gz |
Clone protobuf_udf to ydb/library
Diffstat (limited to 'library/cpp/protobuf/yql')
-rw-r--r-- | library/cpp/protobuf/yql/CMakeLists.darwin-x86_64.txt | 22 | ||||
-rw-r--r-- | library/cpp/protobuf/yql/CMakeLists.linux-aarch64.txt | 23 | ||||
-rw-r--r-- | library/cpp/protobuf/yql/CMakeLists.linux-x86_64.txt | 23 | ||||
-rw-r--r-- | library/cpp/protobuf/yql/CMakeLists.txt | 17 | ||||
-rw-r--r-- | library/cpp/protobuf/yql/CMakeLists.windows-x86_64.txt | 22 | ||||
-rw-r--r-- | library/cpp/protobuf/yql/README.md | 26 | ||||
-rw-r--r-- | library/cpp/protobuf/yql/descriptor.cpp | 314 | ||||
-rw-r--r-- | library/cpp/protobuf/yql/descriptor.h | 161 | ||||
-rw-r--r-- | library/cpp/protobuf/yql/ya.make | 15 |
9 files changed, 623 insertions, 0 deletions
diff --git a/library/cpp/protobuf/yql/CMakeLists.darwin-x86_64.txt b/library/cpp/protobuf/yql/CMakeLists.darwin-x86_64.txt new file mode 100644 index 0000000000..112f6e94e7 --- /dev/null +++ b/library/cpp/protobuf/yql/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,22 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-protobuf-yql) +target_link_libraries(cpp-protobuf-yql PUBLIC + contrib-libs-cxxsupp + yutil + contrib-libs-protobuf + library-cpp-json + cpp-protobuf-dynamic_prototype + cpp-protobuf-json + cpp-string_utils-base64 +) +target_sources(cpp-protobuf-yql PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/protobuf/yql/descriptor.cpp +) diff --git a/library/cpp/protobuf/yql/CMakeLists.linux-aarch64.txt b/library/cpp/protobuf/yql/CMakeLists.linux-aarch64.txt new file mode 100644 index 0000000000..de09662231 --- /dev/null +++ b/library/cpp/protobuf/yql/CMakeLists.linux-aarch64.txt @@ -0,0 +1,23 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-protobuf-yql) +target_link_libraries(cpp-protobuf-yql PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + contrib-libs-protobuf + library-cpp-json + cpp-protobuf-dynamic_prototype + cpp-protobuf-json + cpp-string_utils-base64 +) +target_sources(cpp-protobuf-yql PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/protobuf/yql/descriptor.cpp +) diff --git a/library/cpp/protobuf/yql/CMakeLists.linux-x86_64.txt b/library/cpp/protobuf/yql/CMakeLists.linux-x86_64.txt new file mode 100644 index 0000000000..de09662231 --- /dev/null +++ b/library/cpp/protobuf/yql/CMakeLists.linux-x86_64.txt @@ -0,0 +1,23 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-protobuf-yql) +target_link_libraries(cpp-protobuf-yql PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + contrib-libs-protobuf + library-cpp-json + cpp-protobuf-dynamic_prototype + cpp-protobuf-json + cpp-string_utils-base64 +) +target_sources(cpp-protobuf-yql PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/protobuf/yql/descriptor.cpp +) diff --git a/library/cpp/protobuf/yql/CMakeLists.txt b/library/cpp/protobuf/yql/CMakeLists.txt new file mode 100644 index 0000000000..f8b31df0c1 --- /dev/null +++ b/library/cpp/protobuf/yql/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/library/cpp/protobuf/yql/CMakeLists.windows-x86_64.txt b/library/cpp/protobuf/yql/CMakeLists.windows-x86_64.txt new file mode 100644 index 0000000000..112f6e94e7 --- /dev/null +++ b/library/cpp/protobuf/yql/CMakeLists.windows-x86_64.txt @@ -0,0 +1,22 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(cpp-protobuf-yql) +target_link_libraries(cpp-protobuf-yql PUBLIC + contrib-libs-cxxsupp + yutil + contrib-libs-protobuf + library-cpp-json + cpp-protobuf-dynamic_prototype + cpp-protobuf-json + cpp-string_utils-base64 +) +target_sources(cpp-protobuf-yql PRIVATE + ${CMAKE_SOURCE_DIR}/library/cpp/protobuf/yql/descriptor.cpp +) diff --git a/library/cpp/protobuf/yql/README.md b/library/cpp/protobuf/yql/README.md new file mode 100644 index 0000000000..622bfce841 --- /dev/null +++ b/library/cpp/protobuf/yql/README.md @@ -0,0 +1,26 @@ +Функции данной библиотеки, позволяют генерировать метаинформацию, необходимую YQL для того, чтобы работать с содержимым protobuf-сообщения как со структурой. + +Пример задания атрибута для таблицы в YT: +```c++ +#include <proto/person.pb.h> +#include <library/cpp/protobuf/yql/descriptor.h> + +auto client = NYT::CreateClient(ServerName); +client->Create(OutputPath, NT_TABLE, + TCreateOptions().Attributes(NYT::TNode() + ("_yql_proto_field_Data", GenerateProtobufTypeConfig<TPersonProto>()) +); +``` +Далее, в YQL можно будет написать следующий запрос: +```sql +SELECT t.Data.Name, t.Data.Age, FROM [table] AS t; +``` + +Для того, чтобы работать с protobuf-сообщениями, сохранёнными в разных представлениях есть специальные опции. На данный момент поддерживается три формата представления данных - binary-protobuf, text-protbuf и json. + +Пример использования опций: +```c++ +GenerateProtobufTypeConfig<TPersonProto>( + TProtoTypeConfigOptions().SetProtoFormat(PF_PROTOTEXT)) +); +``` diff --git a/library/cpp/protobuf/yql/descriptor.cpp b/library/cpp/protobuf/yql/descriptor.cpp new file mode 100644 index 0000000000..2d5a154fc1 --- /dev/null +++ b/library/cpp/protobuf/yql/descriptor.cpp @@ -0,0 +1,314 @@ +#include "descriptor.h" + +#include <library/cpp/json/json_reader.h> +#include <library/cpp/json/json_writer.h> +#include <library/cpp/protobuf/dynamic_prototype/dynamic_prototype.h> +#include <library/cpp/protobuf/dynamic_prototype/generate_file_descriptor_set.h> +#include <library/cpp/protobuf/json/json2proto.h> +#include <library/cpp/protobuf/json/proto2json.h> +#include <library/cpp/string_utils/base64/base64.h> + +#include <util/generic/hash.h> +#include <util/generic/queue.h> +#include <util/generic/set.h> +#include <util/generic/vector.h> +#include <util/stream/mem.h> +#include <util/stream/str.h> +#include <util/stream/zlib.h> +#include <util/string/cast.h> + +#include <google/protobuf/text_format.h> +#include <google/protobuf/io/zero_copy_stream_impl_lite.h> + +using namespace NProtoBuf; + +static TString SerializeFileDescriptorSet(const FileDescriptorSet& proto) { + const auto size = proto.ByteSize(); + TTempBuf data(size); + proto.SerializeWithCachedSizesToArray((ui8*)data.Data()); + + TStringStream str; + { + TZLibCompress comp(&str, ZLib::GZip); + comp.Write(data.Data(), size); + } + return str.Str(); +} + +static bool ParseFileDescriptorSet(const TStringBuf& data, FileDescriptorSet* proto) { + TMemoryInput input(data.data(), data.size()); + TString buf = TZLibDecompress(&input).ReadAll(); + + if (!proto->ParseFromArray(buf.data(), buf.size())) { + return false; + } + return true; +} + +TDynamicInfo::TDynamicInfo(TDynamicPrototypePtr dynamicPrototype) + : DynamicPrototype(dynamicPrototype) + , SkipBytes_(0) +{ +} + +TDynamicInfo::~TDynamicInfo() { +} + +TDynamicInfoRef TDynamicInfo::Create(const TStringBuf& typeConfig) { + auto data = ParseTypeConfig(typeConfig); + const TString& meta = Base64Decode(data.Metadata); + const TString& name = data.MessageName; + FileDescriptorSet set; + + if (!ParseFileDescriptorSet(meta, &set)) { + ythrow yexception() << "can't parse metadata"; + } + + auto info = MakeIntrusive<TDynamicInfo>(TDynamicPrototype::Create(set, name, true)); + + info->EnumFormat_ = data.EnumFormat; + info->ProtoFormat_ = data.ProtoFormat; + info->Recursion_ = data.Recursion; + info->YtMode_ = data.YtMode; + info->SkipBytes_ = data.SkipBytes; + info->OptionalLists_ = data.OptionalLists; + info->SyntaxAware_ = data.SyntaxAware; + return info; +} + +const Descriptor* TDynamicInfo::Descriptor() const { + return DynamicPrototype->GetDescriptor(); +} + +EEnumFormat TDynamicInfo::GetEnumFormat() const { + return EnumFormat_; +} + +ERecursionTraits TDynamicInfo::GetRecursionTraits() const { + return Recursion_; +} + +bool TDynamicInfo::GetYtMode() const { + return YtMode_; +} + +bool TDynamicInfo::GetOptionalLists() const { + return OptionalLists_; +} + +bool TDynamicInfo::GetSyntaxAware() const { + return SyntaxAware_; +} + +TAutoPtr<Message> TDynamicInfo::MakeProto() { + return DynamicPrototype->CreateUnsafe(); +} + +TAutoPtr<Message> TDynamicInfo::Parse(const TStringBuf& data) { + auto mut = MakeProto(); + TStringBuf tmp(data); + + if (SkipBytes_) { + tmp = TStringBuf(tmp.data() + SkipBytes_, tmp.size() - SkipBytes_); + } + + switch (ProtoFormat_) { + case PF_PROTOBIN: { + if (!mut->ParseFromArray(tmp.data(), tmp.size())) { + ythrow yexception() << "can't parse protobin message"; + } + break; + } + case PF_PROTOTEXT: { + io::ArrayInputStream si(tmp.data(), tmp.size()); + if (!TextFormat::Parse(&si, mut.Get())) { + ythrow yexception() << "can't parse prototext message"; + } + break; + } + case PF_JSON: { + NJson::TJsonValue value; + + if (NJson::ReadJsonFastTree(tmp, &value)) { + NProtobufJson::Json2Proto(value, *mut); + } else { + ythrow yexception() << "can't parse json value"; + } + break; + } + } + + return mut; +} + +TString TDynamicInfo::Serialize(const Message& proto) { + TString result; + switch (ProtoFormat_) { + case PF_PROTOBIN: { + result.ReserveAndResize(proto.ByteSize()); + if (!proto.SerializeToArray(result.begin(), result.size())) { + ythrow yexception() << "can't serialize protobin message"; + } + break; + } + case PF_PROTOTEXT: { + if (!TextFormat::PrintToString(proto, &result)) { + ythrow yexception() << "can't serialize prototext message"; + } + break; + } + case PF_JSON: { + NJson::TJsonValue value; + NProtobufJson::Proto2Json(proto, value); + result = NJson::WriteJson(value); + break; + } + } + return result; +} + +TString GenerateProtobufTypeConfig( + const Descriptor* descriptor, + const TProtoTypeConfigOptions& options) { + NJson::TJsonValue ret(NJson::JSON_MAP); + + ret["name"] = descriptor->full_name(); + ret["meta"] = Base64Encode( + SerializeFileDescriptorSet(GenerateFileDescriptorSet(descriptor))); + + if (options.SkipBytes > 0) { + ret["skip"] = options.SkipBytes; + } + + switch (options.ProtoFormat) { + case PF_PROTOBIN: + break; + case PF_PROTOTEXT: + ret["format"] = "prototext"; + break; + case PF_JSON: + ret["format"] = "json"; + break; + } + + if (!options.OptionalLists) { + ret["lists"]["optional"] = false; + } + + if (options.SyntaxAware) { + ret["syntax"]["aware"] = options.SyntaxAware; + } + + switch (options.EnumFormat) { + case EEnumFormat::Number: + break; + case EEnumFormat::Name: + ret["view"]["enum"] = "name"; + break; + case EEnumFormat::FullName: + ret["view"]["enum"] = "full_name"; + break; + } + + switch (options.Recursion) { + case ERecursionTraits::Fail: + break; + case ERecursionTraits::Ignore: + ret["view"]["recursion"] = "ignore"; + break; + case ERecursionTraits::Bytes: + ret["view"]["recursion"] = "bytes"; + break; + } + + if (options.YtMode) { + ret["view"]["yt_mode"] = true; + } + + return NJson::WriteJson(ret, false); +} + +TProtoTypeConfig ParseTypeConfig(const TStringBuf& config) { + if (config.empty()) { + ythrow yexception() << "empty metadata"; + } + + switch (config[0]) { + case '#': { + auto plus = config.find('+'); + + if (config[0] != '#') { + ythrow yexception() << "unknown version of metadata format"; + } + if (plus == TStringBuf::npos) { + ythrow yexception() << "invalid metadata"; + } + + TProtoTypeConfig result; + + result.MessageName = TStringBuf(config.begin() + 1, plus - 1); + result.Metadata = TStringBuf(config.begin() + 1 + plus, config.size() - plus - 1); + result.SkipBytes = 0; + + return result; + } + + case '{': { + NJson::TJsonValue value; + + if (NJson::ReadJsonFastTree(config, &value)) { + TProtoTypeConfig result; + TString protoFormat = value["format"].GetStringSafe("protobin"); + TString enumFormat = value["view"]["enum"].GetStringSafe("number"); + TString recursion = value["view"]["recursion"].GetStringSafe("fail"); + + result.MessageName = value["name"].GetString(); + result.Metadata = value["meta"].GetString(); + result.SkipBytes = value["skip"].GetIntegerSafe(0); + result.OptionalLists = value["lists"]["optional"].GetBooleanSafe(true); + result.SyntaxAware = value["syntax"]["aware"].GetBooleanSafe(false); + result.YtMode = value["view"]["yt_mode"].GetBooleanSafe(false); + + if (protoFormat == "protobin") { + result.ProtoFormat = PF_PROTOBIN; + } else if (protoFormat == "prototext") { + result.ProtoFormat = PF_PROTOTEXT; + } else if (protoFormat == "json") { + result.ProtoFormat = PF_JSON; + } else { + ythrow yexception() << "unsupported format " << protoFormat; + } + + if (enumFormat == "number") { + result.EnumFormat = EEnumFormat::Number; + } else if (enumFormat == "name") { + result.EnumFormat = EEnumFormat::Name; + } else if (enumFormat == "full_name") { + result.EnumFormat = EEnumFormat::FullName; + } else { + ythrow yexception() << "unsupported enum representation " + << enumFormat; + } + + if (recursion == "fail") { + result.Recursion = ERecursionTraits::Fail; + } else if (recursion == "ignore") { + result.Recursion = ERecursionTraits::Ignore; + } else if (recursion == "bytes") { + result.Recursion = ERecursionTraits::Bytes; + } else { + ythrow yexception() << "unsupported recursion trait " + << recursion; + } + + return result; + } else { + ythrow yexception() << "can't parse json metadata"; + } + } + + default: + ythrow yexception() << "invalid control char " + << TStringBuf(config.data(), 1); + } +} diff --git a/library/cpp/protobuf/yql/descriptor.h b/library/cpp/protobuf/yql/descriptor.h new file mode 100644 index 0000000000..bbed6850ec --- /dev/null +++ b/library/cpp/protobuf/yql/descriptor.h @@ -0,0 +1,161 @@ +#pragma once + +#include <google/protobuf/descriptor.h> +#include <google/protobuf/dynamic_message.h> +#include <google/protobuf/descriptor.pb.h> + +#include <library/cpp/protobuf/dynamic_prototype/dynamic_prototype.h> + +#include <util/generic/ptr.h> + +enum EProtoFormat { + PF_PROTOBIN = 0, + PF_PROTOTEXT = 1, + PF_JSON = 2, +}; + +enum class EEnumFormat { + Number = 0, + Name = 1, + FullName = 2, +}; + +enum class ERecursionTraits { + //! Падать, если на входе имеется рекурсивно определённый тип. + Fail = 0, + //! Игнорировать все поля с рекурсивно определённым типом. + Ignore = 1, + //! Возвращать поля с рекурсивным типом в виде сериализованной строки + Bytes = 2, +}; + +struct TProtoTypeConfig { + //! Имя сообщения. + TString MessageName; + //! Сериализованные метаданные. + TString Metadata; + //! Количество байт, которые надо отступить + //! от начала каждого блока данных. + ui32 SkipBytes = 0; + //! Формат сериализации proto-сообщений. + EProtoFormat ProtoFormat = PF_PROTOBIN; + //! Формат представление значений Enum. + EEnumFormat EnumFormat = EEnumFormat::Number; + //! Способ интерпретации рекурсивно определённых типов. + ERecursionTraits Recursion = ERecursionTraits::Fail; + //! Выдать бинарными строками, если SERIALIZATION_PROTOBUF. + //! Если SERIALIZATION_YT, то для рекурсивных структур поведение зависит от Recursion. + bool YtMode = false; + //! Заворачивать ли списочные типы в Optional. + bool OptionalLists = false; + //! Заполнять ли пустые Optional типы дефолтным значением (только для proto3). + bool SyntaxAware = false; +}; + +struct TProtoTypeConfigOptions { + //! Количество байт, которые надо отступить + //! от начала каждого блока данных. + ui32 SkipBytes = 0; + //! Формат сериализации proto-сообщений. + EProtoFormat ProtoFormat = PF_PROTOBIN; + //! Формат представление значений Enum. + EEnumFormat EnumFormat = EEnumFormat::Number; + //! Способ интерпретации рекурсивно определённых типов. + ERecursionTraits Recursion = ERecursionTraits::Fail; + //! Выдать бинарными строками, если SERIALIZATION_PROTOBUF. + //! Если SERIALIZATION_YT, то для рекурсивных структур поведение зависит от Recursion. + bool YtMode = false; + //! Заворачивать ли списочные типы в Optional. + bool OptionalLists = false; + //! Заполнять ли пустые Optional типы дефолтным значением (только для proto3). + bool SyntaxAware = false; + + TProtoTypeConfigOptions& SetProtoFormat(EProtoFormat value) { + ProtoFormat = value; + return *this; + } + + TProtoTypeConfigOptions& SetSkipBytes(ui32 value) { + SkipBytes = value; + return *this; + } + + TProtoTypeConfigOptions& SetEnumFormat(EEnumFormat value) { + EnumFormat = value; + return *this; + } + + TProtoTypeConfigOptions& SetRecursionTraits(ERecursionTraits value) { + Recursion = value; + return *this; + } + + TProtoTypeConfigOptions& SetYtMode(bool value) { + YtMode = value; + return *this; + } + + TProtoTypeConfigOptions& SetOptionalLists(bool value) { + OptionalLists = value; + return *this; + } + + TProtoTypeConfigOptions& SetSyntaxAware(bool value) { + SyntaxAware = value; + return *this; + } +}; + +TString GenerateProtobufTypeConfig( + const NProtoBuf::Descriptor* descriptor, + const TProtoTypeConfigOptions& options = TProtoTypeConfigOptions()); + +template <typename T> +inline TString GenerateProtobufTypeConfig( + const TProtoTypeConfigOptions& options = TProtoTypeConfigOptions()) { + return GenerateProtobufTypeConfig(T::descriptor(), options); +} + +TProtoTypeConfig ParseTypeConfig(const TStringBuf& config); + +using TDynamicInfoRef = TIntrusivePtr<class TDynamicInfo>; + +class TDynamicInfo: public TSimpleRefCount<TDynamicInfo> { +public: + TDynamicInfo(TDynamicPrototypePtr); + ~TDynamicInfo(); + + static TDynamicInfoRef Create(const TStringBuf& typeConfig); + + const NProtoBuf::Descriptor* Descriptor() const; + + //! Текущий формат представление значений Enum. + EEnumFormat GetEnumFormat() const; + + //! Текущий способ интерпретации рекурсивно определённых типов. + ERecursionTraits GetRecursionTraits() const; + + bool GetYtMode() const; + + bool GetOptionalLists() const; + + bool GetSyntaxAware() const; + + TAutoPtr<NProtoBuf::Message> MakeProto(); + + //! \param data сериализованное protobuf-сообщение. + TAutoPtr<NProtoBuf::Message> Parse(const TStringBuf& data); + + //! \param proto protobuf-сообщение. + TString Serialize(const NProtoBuf::Message& proto); + +private: + TDynamicPrototypePtr DynamicPrototype; + EEnumFormat EnumFormat_; + EProtoFormat ProtoFormat_; + ERecursionTraits Recursion_; + bool YtMode_; + ui32 SkipBytes_; + bool OptionalLists_; + bool SyntaxAware_; +}; diff --git a/library/cpp/protobuf/yql/ya.make b/library/cpp/protobuf/yql/ya.make new file mode 100644 index 0000000000..29c21da402 --- /dev/null +++ b/library/cpp/protobuf/yql/ya.make @@ -0,0 +1,15 @@ +LIBRARY() + +SRCS( + descriptor.cpp +) + +PEERDIR( + contrib/libs/protobuf + library/cpp/json + library/cpp/protobuf/dynamic_prototype + library/cpp/protobuf/json + library/cpp/string_utils/base64 +) + +END() |