diff options
author | vitalyisaev <vitalyisaev@ydb.tech> | 2023-11-14 09:58:56 +0300 |
---|---|---|
committer | vitalyisaev <vitalyisaev@ydb.tech> | 2023-11-14 10:20:20 +0300 |
commit | c2b2dfd9827a400a8495e172a56343462e3ceb82 (patch) | |
tree | cd4e4f597d01bede4c82dffeb2d780d0a9046bd0 /contrib/clickhouse/src/Formats | |
parent | d4ae8f119e67808cb0cf776ba6e0cf95296f2df7 (diff) | |
download | ydb-c2b2dfd9827a400a8495e172a56343462e3ceb82.tar.gz |
YQ Connector: move tests from yql to ydb (OSS)
Перенос папки с тестами на Коннектор из папки yql в папку ydb (синхронизируется с github).
Diffstat (limited to 'contrib/clickhouse/src/Formats')
59 files changed, 14407 insertions, 0 deletions
diff --git a/contrib/clickhouse/src/Formats/BSONTypes.cpp b/contrib/clickhouse/src/Formats/BSONTypes.cpp new file mode 100644 index 0000000000..88396fd2ab --- /dev/null +++ b/contrib/clickhouse/src/Formats/BSONTypes.cpp @@ -0,0 +1,106 @@ +#include <Formats/BSONTypes.h> +#include <Common/Exception.h> +#include <base/hex.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNKNOWN_TYPE; +} + +static std::string byteToHexString(uint8_t byte) +{ + return "0x" + getHexUIntUppercase(byte); +} + +BSONType getBSONType(uint8_t value) +{ + if ((value >= 0x01 && value <= 0x13) || value == 0xFF || value == 0x7f) + return BSONType(value); + + throw Exception(ErrorCodes::UNKNOWN_TYPE, "Unknown BSON type: {}", byteToHexString(value)); +} + +BSONBinarySubtype getBSONBinarySubtype(uint8_t value) +{ + if (value <= 0x07) + return BSONBinarySubtype(value); + + throw Exception(ErrorCodes::UNKNOWN_TYPE, "Unknown BSON binary subtype: {}", byteToHexString(value)); +} + +std::string getBSONTypeName(BSONType type) +{ + switch (type) + { + case BSONType::BINARY: + return "Binary"; + case BSONType::SYMBOL: + return "Symbol"; + case BSONType::ARRAY: + return "Array"; + case BSONType::DOCUMENT: + return "Document"; + case BSONType::TIMESTAMP: + return "Timestamp"; + case BSONType::INT64: + return "Int64"; + case BSONType::INT32: + return "Int32"; + case BSONType::BOOL: + return "Bool"; + case BSONType::DOUBLE: + return "Double"; + case BSONType::STRING: + return "String"; + case BSONType::DECIMAL128: + return "Decimal128"; + case BSONType::JAVA_SCRIPT_CODE_W_SCOPE: + return "JavaScript code w/ scope"; + case BSONType::JAVA_SCRIPT_CODE: + return "JavaScript code"; + case BSONType::DB_POINTER: + return "DBPointer"; + case BSONType::REGEXP: + return "Regexp"; + case BSONType::DATETIME: + return "Datetime"; + case BSONType::OBJECT_ID: + return "ObjectId"; + case BSONType::UNDEFINED: + return "Undefined"; + case BSONType::NULL_VALUE: + return "Null"; + case BSONType::MAX_KEY: + return 
"Max key"; + case BSONType::MIN_KEY: + return "Min key"; + } +} + +std::string getBSONBinarySubtypeName(BSONBinarySubtype subtype) +{ + switch (subtype) + { + case BSONBinarySubtype::BINARY: + return "Binary"; + case BSONBinarySubtype::FUNCTION: + return "Function"; + case BSONBinarySubtype::BINARY_OLD: + return "Binary (Old)"; + case BSONBinarySubtype::UUID_OLD: + return "UUID (Old)"; + case BSONBinarySubtype::UUID: + return "UUID"; + case BSONBinarySubtype::MD5: + return "MD5"; + case BSONBinarySubtype::ENCRYPTED_BSON_VALUE: + return "Encrypted BSON value"; + case BSONBinarySubtype::COMPRESSED_BSON_COLUMN: + return "Compressed BSON column"; + } +} + +} diff --git a/contrib/clickhouse/src/Formats/BSONTypes.h b/contrib/clickhouse/src/Formats/BSONTypes.h new file mode 100644 index 0000000000..14a3e9decc --- /dev/null +++ b/contrib/clickhouse/src/Formats/BSONTypes.h @@ -0,0 +1,59 @@ +#pragma once + +#include <cstdint> +#include <string> + +namespace DB +{ + +static const uint8_t BSON_DOCUMENT_END = 0x00; +static const size_t BSON_OBJECT_ID_SIZE = 12; +static const size_t BSON_DB_POINTER_SIZE = 12; +using BSONSizeT = uint32_t; +static const BSONSizeT MAX_BSON_SIZE = std::numeric_limits<BSONSizeT>::max(); + +/// See details on https://bsonspec.org/spec.html +enum class BSONType +{ + DOUBLE = 0x01, + STRING = 0x02, + DOCUMENT = 0x03, + ARRAY = 0x04, + BINARY = 0x05, + UNDEFINED = 0x06, + OBJECT_ID = 0x07, + BOOL = 0x08, + DATETIME = 0x09, + NULL_VALUE = 0x0A, + REGEXP = 0x0B, + DB_POINTER = 0x0C, + JAVA_SCRIPT_CODE = 0x0D, + SYMBOL = 0x0E, + JAVA_SCRIPT_CODE_W_SCOPE = 0x0F, + INT32 = 0x10, + TIMESTAMP = 0x11, + INT64 = 0x12, + DECIMAL128 = 0x13, + MIN_KEY = 0xFF, + MAX_KEY = 0x7F, +}; + +enum class BSONBinarySubtype +{ + BINARY = 0x00, + FUNCTION = 0x01, + BINARY_OLD = 0x02, + UUID_OLD = 0x03, + UUID = 0x04, + MD5 = 0x05, + ENCRYPTED_BSON_VALUE = 0x06, + COMPRESSED_BSON_COLUMN = 0x07, +}; + +BSONType getBSONType(uint8_t value); +std::string getBSONTypeName(BSONType 
type); + +BSONBinarySubtype getBSONBinarySubtype(uint8_t value); +std::string getBSONBinarySubtypeName(BSONBinarySubtype subtype); + +} diff --git a/contrib/clickhouse/src/Formats/CapnProtoSchema.cpp b/contrib/clickhouse/src/Formats/CapnProtoSchema.cpp new file mode 100644 index 0000000000..1a10f867ef --- /dev/null +++ b/contrib/clickhouse/src/Formats/CapnProtoSchema.cpp @@ -0,0 +1,298 @@ +#include <Formats/CapnProtoSchema.h> + +#if USE_CAPNP + +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeEnum.h> +#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/DataTypesNumber.h> +#include <DataTypes/DataTypeString.h> +#include <DataTypes/IDataType.h> +#include <Common/StringUtils/StringUtils.h> +#include <boost/algorithm/string/join.hpp> +#error #include <capnp/schema.h> +#error #include <capnp/schema-parser.h> +#include <fcntl.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_PARSE_CAPN_PROTO_SCHEMA; + extern const int BAD_TYPE_OF_FIELD; + extern const int FILE_DOESNT_EXIST; + extern const int UNKNOWN_EXCEPTION; + extern const int CAPN_PROTO_BAD_TYPE; + extern const int BAD_ARGUMENTS; +} + +capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info) +{ + capnp::ParsedSchema schema; + try + { + int fd; + KJ_SYSCALL(fd = open(schema_info.schemaDirectory().data(), O_RDONLY)); // NOLINT(bugprone-suspicious-semicolon) + auto schema_dir = kj::newDiskDirectory(kj::OsFileHandle(fd)); + schema = impl.parseFromDirectory(*schema_dir, kj::Path::parse(schema_info.schemaPath()), {}); + } + catch (const kj::Exception & e) + { + /// That's not good to determine the type of error by its description, but + /// this is the only way to do it here, because kj doesn't specify the type of error. 
+ auto description = std::string_view(e.getDescription().cStr()); + if (description.find("No such file or directory") != String::npos || description.find("no such directory") != String::npos || description.find("no such file") != String::npos) + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot open CapnProto schema, file {} doesn't exists", schema_info.absoluteSchemaPath()); + + if (description.find("Parse error") != String::npos) + throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, "Cannot parse CapnProto schema {}:{}", schema_info.schemaPath(), e.getLine()); + + throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, + "Unknown exception while parsing CapnProto schema: {}, schema dir and file: {}, {}", + description, schema_info.schemaDirectory(), schema_info.schemaPath()); + } + + auto message_maybe = schema.findNested(schema_info.messageName()); + auto * message_schema = kj::_::readMaybe(message_maybe); + if (!message_schema) + throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, + "CapnProto schema doesn't contain message with name {}", schema_info.messageName()); + return message_schema->asStruct(); +} + +bool checkIfStructContainsUnnamedUnion(const capnp::StructSchema & struct_schema) +{ + return struct_schema.getFields().size() != struct_schema.getNonUnionFields().size(); +} + +bool checkIfStructIsNamedUnion(const capnp::StructSchema & struct_schema) +{ + return struct_schema.getFields().size() == struct_schema.getUnionFields().size(); +} + +/// Get full name of type for better exception messages. 
+String getCapnProtoFullTypeName(const capnp::Type & type) +{ + static const std::map<capnp::schema::Type::Which, String> capnp_simple_type_names = + { + {capnp::schema::Type::Which::BOOL, "Bool"}, + {capnp::schema::Type::Which::VOID, "Void"}, + {capnp::schema::Type::Which::INT8, "Int8"}, + {capnp::schema::Type::Which::INT16, "Int16"}, + {capnp::schema::Type::Which::INT32, "Int32"}, + {capnp::schema::Type::Which::INT64, "Int64"}, + {capnp::schema::Type::Which::UINT8, "UInt8"}, + {capnp::schema::Type::Which::UINT16, "UInt16"}, + {capnp::schema::Type::Which::UINT32, "UInt32"}, + {capnp::schema::Type::Which::UINT64, "UInt64"}, + {capnp::schema::Type::Which::FLOAT32, "Float32"}, + {capnp::schema::Type::Which::FLOAT64, "Float64"}, + {capnp::schema::Type::Which::TEXT, "Text"}, + {capnp::schema::Type::Which::DATA, "Data"}, + {capnp::schema::Type::Which::INTERFACE, "Interface"}, + {capnp::schema::Type::Which::ANY_POINTER, "AnyPointer"}, + }; + + switch (type.which()) + { + case capnp::schema::Type::Which::STRUCT: + { + auto struct_schema = type.asStruct(); + + auto non_union_fields = struct_schema.getNonUnionFields(); + std::vector<String> non_union_field_names; + for (auto nested_field : non_union_fields) + non_union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType())); + + auto union_fields = struct_schema.getUnionFields(); + std::vector<String> union_field_names; + for (auto nested_field : union_fields) + union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType())); + + String union_name = "Union(" + boost::algorithm::join(union_field_names, ", ") + ")"; + /// Check if the struct is a named union. + if (non_union_field_names.empty()) + return union_name; + + String type_name = "Struct(" + boost::algorithm::join(non_union_field_names, ", "); + /// Check if the struct contains unnamed union. 
+ if (!union_field_names.empty()) + type_name += ", " + union_name; + type_name += ")"; + return type_name; + } + case capnp::schema::Type::Which::LIST: + return "List(" + getCapnProtoFullTypeName(type.asList().getElementType()) + ")"; + case capnp::schema::Type::Which::ENUM: + { + auto enum_schema = type.asEnum(); + String enum_name = "Enum("; + auto enumerants = enum_schema.getEnumerants(); + for (unsigned i = 0; i != enumerants.size(); ++i) + { + enum_name += String(enumerants[i].getProto().getName()) + " = " + std::to_string(enumerants[i].getOrdinal()); + if (i + 1 != enumerants.size()) + enum_name += ", "; + } + enum_name += ")"; + return enum_name; + } + default: + auto it = capnp_simple_type_names.find(type.which()); + if (it == capnp_simple_type_names.end()) + throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unknown CapnProto type"); + return it->second; + } +} + +namespace +{ + + template <typename ValueType> + DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants) + { + std::vector<std::pair<String, ValueType>> values; + for (auto enumerant : enumerants) + values.emplace_back(enumerant.getProto().getName(), ValueType(enumerant.getOrdinal())); + return std::make_shared<DataTypeEnum<ValueType>>(std::move(values)); + } + + DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema) + { + auto enumerants = enum_schema.getEnumerants(); + if (enumerants.size() < 128) + return getEnumDataTypeFromEnumerants<Int8>(enumerants); + if (enumerants.size() < 32768) + return getEnumDataTypeFromEnumerants<Int16>(enumerants); + + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "ClickHouse supports only 8 and 16-bit Enums"); + } + + DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type, bool skip_unsupported_fields) + { + switch (capnp_type.which()) + { + case capnp::schema::Type::INT8: + return std::make_shared<DataTypeInt8>(); + case capnp::schema::Type::INT16: + return 
std::make_shared<DataTypeInt16>(); + case capnp::schema::Type::INT32: + return std::make_shared<DataTypeInt32>(); + case capnp::schema::Type::INT64: + return std::make_shared<DataTypeInt64>(); + case capnp::schema::Type::BOOL: [[fallthrough]]; + case capnp::schema::Type::UINT8: + return std::make_shared<DataTypeUInt8>(); + case capnp::schema::Type::UINT16: + return std::make_shared<DataTypeUInt16>(); + case capnp::schema::Type::UINT32: + return std::make_shared<DataTypeUInt32>(); + case capnp::schema::Type::UINT64: + return std::make_shared<DataTypeUInt64>(); + case capnp::schema::Type::FLOAT32: + return std::make_shared<DataTypeFloat32>(); + case capnp::schema::Type::FLOAT64: + return std::make_shared<DataTypeFloat64>(); + case capnp::schema::Type::DATA: [[fallthrough]]; + case capnp::schema::Type::TEXT: + return std::make_shared<DataTypeString>(); + case capnp::schema::Type::ENUM: + return getEnumDataTypeFromEnumSchema(capnp_type.asEnum()); + case capnp::schema::Type::LIST: + { + auto list_schema = capnp_type.asList(); + auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType(), skip_unsupported_fields); + if (!nested_type) + return nullptr; + return std::make_shared<DataTypeArray>(nested_type); + } + case capnp::schema::Type::STRUCT: + { + auto struct_schema = capnp_type.asStruct(); + + + if (struct_schema.getFields().size() == 0) + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Empty messages are not supported"); + } + + /// Check if it can be Nullable. + if (checkIfStructIsNamedUnion(struct_schema)) + { + auto fields = struct_schema.getUnionFields(); + if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid())) + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unions are not supported"); + } + auto value_type = fields[0].getType().isVoid() ? 
fields[1].getType() : fields[0].getType(); + if (value_type.isStruct() || value_type.isList()) + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Tuples and Lists cannot be inside Nullable"); + } + + auto nested_type = getDataTypeFromCapnProtoType(value_type, skip_unsupported_fields); + if (!nested_type) + return nullptr; + return std::make_shared<DataTypeNullable>(nested_type); + } + + if (checkIfStructContainsUnnamedUnion(struct_schema)) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); + + /// Treat Struct as Tuple. + DataTypes nested_types; + Names nested_names; + for (auto field : struct_schema.getNonUnionFields()) + { + auto nested_type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); + if (!nested_type) + continue; + nested_names.push_back(field.getProto().getName()); + nested_types.push_back(nested_type); + } + if (nested_types.empty()) + return nullptr; + return std::make_shared<DataTypeTuple>(std::move(nested_types), std::move(nested_names)); + } + default: + { + if (skip_unsupported_fields) + return nullptr; + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unsupported CapnProtoType: {}", getCapnProtoFullTypeName(capnp_type)); + } + } +} + +} + +NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields) +{ + if (checkIfStructContainsUnnamedUnion(schema)) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); + + NamesAndTypesList names_and_types; + for (auto field : schema.getNonUnionFields()) + { + auto name = field.getProto().getName(); + auto type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); + if (type) + names_and_types.emplace_back(name, type); + } + if (names_and_types.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Cannot convert CapnProto schema to ClickHouse table schema, all fields have unsupported types"); + + 
return names_and_types; +} + +} + +#endif diff --git a/contrib/clickhouse/src/Formats/CapnProtoSchema.h b/contrib/clickhouse/src/Formats/CapnProtoSchema.h new file mode 100644 index 0000000000..b446b12688 --- /dev/null +++ b/contrib/clickhouse/src/Formats/CapnProtoSchema.h @@ -0,0 +1,43 @@ +#pragma once + +#include "clickhouse_config.h" +#if USE_CAPNP + +#include <Formats/FormatSchemaInfo.h> +#include <Formats/FormatSettings.h> +#include <Core/Block.h> +#error #include <capnp/schema-parser.h> +#error #include <capnp/dynamic.h> + +namespace DB +{ +// Wrapper for classes that could throw in destructor +// https://github.com/capnproto/capnproto/issues/553 +template <typename T> +struct DestructorCatcher +{ + T impl; + template <typename ... Arg> + explicit DestructorCatcher(Arg && ... args) : impl(kj::fwd<Arg>(args)...) {} + ~DestructorCatcher() noexcept try { } catch (...) { return; } +}; + +class CapnProtoSchemaParser : public DestructorCatcher<capnp::SchemaParser> +{ +public: + CapnProtoSchemaParser() = default; + + capnp::StructSchema getMessageSchema(const FormatSchemaInfo & schema_info); +}; + +bool checkIfStructContainsUnnamedUnion(const capnp::StructSchema & struct_schema); +bool checkIfStructIsNamedUnion(const capnp::StructSchema & struct_schema); + +/// Get full name of type for better exception messages. 
+String getCapnProtoFullTypeName(const capnp::Type & type); + +NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields); + +} + +#endif diff --git a/contrib/clickhouse/src/Formats/CapnProtoSerializer.cpp b/contrib/clickhouse/src/Formats/CapnProtoSerializer.cpp new file mode 100644 index 0000000000..1be73aa4ad --- /dev/null +++ b/contrib/clickhouse/src/Formats/CapnProtoSerializer.cpp @@ -0,0 +1,1538 @@ +#include "clickhouse_config.h" + +#if USE_CAPNP + +#include <Formats/CapnProtoSerializer.h> +#include <Formats/FormatSettings.h> +#include <Formats/CapnProtoSchema.h> +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeEnum.h> +#include <DataTypes/DataTypeLowCardinality.h> +#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/DataTypeMap.h> +#include <DataTypes/IDataType.h> +#include <Columns/ColumnArray.h> +#include <Columns/ColumnNullable.h> +#include <Columns/ColumnString.h> +#include <Columns/ColumnFixedString.h> +#include <Columns/ColumnTuple.h> +#include <Columns/ColumnLowCardinality.h> +#include <Columns/ColumnsDateTime.h> +#include <Columns/ColumnMap.h> + +#include <boost/algorithm/string.hpp> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int THERE_IS_NO_COLUMN; + extern const int CAPN_PROTO_BAD_CAST; + extern const int INCORRECT_DATA; + extern const int ILLEGAL_COLUMN; +} + +namespace +{ + std::pair<String, String> splitFieldName(const String & name) + { + const auto * begin = name.data(); + const auto * end = name.data() + name.size(); + const auto * it = find_first_symbols<'_', '.'>(begin, end); + String first = String(begin, it); + String second = it == end ? 
"" : String(it + 1, end); + return {first, second}; + } + + std::optional<capnp::StructSchema::Field> findFieldByName(const capnp::StructSchema & struct_schema, const String & name) + { + const auto & fields = struct_schema.getFields(); + for (auto field : fields) + { + auto field_name = String(field.getProto().getName()); + if (boost::to_lower_copy(name) == boost::to_lower_copy(field_name)) + return field; + } + return std::nullopt; + } + + [[noreturn]] void throwCannotConvert(const DataTypePtr & type, const String & name, const capnp::Type & capnp_type) + { + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto type {}", + name, + type->getName(), + getCapnProtoFullTypeName(capnp_type)); + } + + struct FieldBuilder + { + virtual ~FieldBuilder() = default; + }; + + struct ListBuilder : public FieldBuilder + { + explicit ListBuilder(capnp::DynamicValue::Builder builder, UInt32 elements_size) : impl(builder.as<capnp::DynamicList>()), nested_builders(elements_size) + { + } + + capnp::DynamicList::Builder impl; + std::vector<std::unique_ptr<FieldBuilder>> nested_builders; + }; + + struct StructBuilder : public FieldBuilder + { + explicit StructBuilder(capnp::DynamicStruct::Builder struct_builder, size_t fields_size) : impl(std::move(struct_builder)), field_builders(fields_size) + { + } + + capnp::DynamicStruct::Builder impl; + std::vector<std::unique_ptr<FieldBuilder>> field_builders; + }; + + template <typename ParentBuilder> + std::unique_ptr<StructBuilder> initStructBuilder(ParentBuilder & parent_builder, UInt32 offset_or_index, const capnp::_::StructSize & struct_size, size_t elements, const capnp::StructSchema & schema) + { + capnp::DynamicStruct::Builder builder_impl; + if constexpr (std::is_same_v<ParentBuilder, capnp::DynamicStruct::Builder>) + builder_impl = capnp::DynamicStruct::Builder(schema, parent_builder.getBuilderImpl().getPointerField(offset_or_index).initStruct(struct_size)); + else 
+ builder_impl = capnp::DynamicStruct::Builder(schema, parent_builder.getBuilderImpl().getStructElement(offset_or_index)); + return std::make_unique<StructBuilder>(std::move(builder_impl), elements); + } + + class ICapnProtoSerializer + { + public: + /// Write row as struct field. + virtual void writeRow( + const ColumnPtr & column, + std::unique_ptr<FieldBuilder> & builder, /// Maybe unused for simple types, needed to initialize structs and lists. + capnp::DynamicStruct::Builder & parent_struct_builder, + UInt32 slot_offset, + size_t row_num) = 0; + + /// Write row as list element. + virtual void writeRow( + const ColumnPtr & column, + std::unique_ptr<FieldBuilder> & builder, /// Maybe unused for simple types, needed to initialize structs and lists. + capnp::DynamicList::Builder & parent_list_builder, + UInt32 array_index, + size_t row_num) = 0; + + /// Read row from struct field at slot_offset. + virtual void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) = 0; + + /// Read row from list element at array_index. 
+ virtual void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) = 0; + + virtual ~ICapnProtoSerializer() = default; + }; + + template <typename CHNumericType, typename CapnProtoNumericType, bool convert_to_bool_on_read> + class CapnProtoIntegerSerializer : public ICapnProtoSerializer + { + public: + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().setDataField<CapnProtoNumericType>(slot_offset, getValue(column, row_num)); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + parent_list_builder.getBuilderImpl().setDataElement<CapnProtoNumericType>(array_index, getValue(column, row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + insertValue(column, parent_struct_reader.getReaderImpl().getDataField<CapnProtoNumericType>(slot_offset)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + insertValue(column, parent_list_reader.getReaderImpl().getDataElement<CapnProtoNumericType>(array_index)); + } + + private: + CapnProtoNumericType getValue(const ColumnPtr & column, size_t row_num) + { + return static_cast<CapnProtoNumericType>(assert_cast<const ColumnVector<CHNumericType> &>(*column).getElement(row_num)); + } + + void insertValue(IColumn & column, CapnProtoNumericType value) + { + if constexpr (convert_to_bool_on_read) + assert_cast<ColumnUInt8 &>(column).insertValue(static_cast<bool>(value)); + else + assert_cast<ColumnVector<CHNumericType> &>(column).insertValue(static_cast<CHNumericType>(value)); + } + }; + + template <typename NumericType, bool 
convert_to_bool_on_read = false> + std::unique_ptr<ICapnProtoSerializer> createIntegerSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + switch (capnp_type.which()) + { + case capnp::schema::Type::INT8: + return std::make_unique<CapnProtoIntegerSerializer<NumericType, Int8, convert_to_bool_on_read>>(); + case capnp::schema::Type::INT16: + return std::make_unique<CapnProtoIntegerSerializer<NumericType, Int16, convert_to_bool_on_read>>(); + case capnp::schema::Type::INT32: + return std::make_unique<CapnProtoIntegerSerializer<NumericType, Int32, convert_to_bool_on_read>>(); + case capnp::schema::Type::INT64: + return std::make_unique<CapnProtoIntegerSerializer<NumericType, Int64, convert_to_bool_on_read>>(); + case capnp::schema::Type::UINT8: + return std::make_unique<CapnProtoIntegerSerializer<NumericType, UInt8, convert_to_bool_on_read>>(); + case capnp::schema::Type::UINT16: + return std::make_unique<CapnProtoIntegerSerializer<NumericType, UInt16, convert_to_bool_on_read>>(); + case capnp::schema::Type::UINT32: + return std::make_unique<CapnProtoIntegerSerializer<NumericType, UInt32, convert_to_bool_on_read>>(); + case capnp::schema::Type::UINT64: + return std::make_unique<CapnProtoIntegerSerializer<NumericType, UInt64, convert_to_bool_on_read>>(); + case capnp::schema::Type::BOOL: + return std::make_unique<CapnProtoIntegerSerializer<NumericType, bool, convert_to_bool_on_read>>(); + default: + throwCannotConvert(data_type, column_name, capnp_type); + } + } + + template <typename CHFloatType, typename CapnProtoFloatType> + class CapnProtoFloatSerializer : public ICapnProtoSerializer + { + public: + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().setDataField<CapnProtoFloatType>(slot_offset, getValue(column, row_num)); + } + + void 
writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + parent_list_builder.getBuilderImpl().setDataElement<CapnProtoFloatType>(array_index, getValue(column, row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + insertValue(column, parent_struct_reader.getReaderImpl().getDataField<CapnProtoFloatType>(slot_offset)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + insertValue(column, parent_list_reader.getReaderImpl().getDataElement<CapnProtoFloatType>(array_index)); + } + + private: + CapnProtoFloatType getValue(const ColumnPtr & column, size_t row_num) + { + return static_cast<CapnProtoFloatType>(assert_cast<const ColumnVector<CHFloatType> &>(*column).getElement(row_num)); + } + + void insertValue(IColumn & column, CapnProtoFloatType value) + { + assert_cast<ColumnVector<CHFloatType> &>(column).insertValue(static_cast<CHFloatType>(value)); + } + }; + + template <typename FloatType> + std::unique_ptr<ICapnProtoSerializer> createFloatSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + switch (capnp_type.which()) + { + case capnp::schema::Type::FLOAT32: + return std::make_unique<CapnProtoFloatSerializer<FloatType, Float32>>(); + case capnp::schema::Type::FLOAT64: + return std::make_unique<CapnProtoFloatSerializer<FloatType, Float64>>(); + default: + throwCannotConvert(data_type, column_name, capnp_type); + } + } + + template <typename EnumType> + class CapnProtoEnumSerializer : public ICapnProtoSerializer + { + public: + CapnProtoEnumSerializer( + const DataTypePtr & data_type_, + const String & column_name, + const capnp::Type & capnp_type, + const FormatSettings::CapnProtoEnumComparingMode enum_comparing_mode_) : 
data_type(data_type_), enum_comparing_mode(enum_comparing_mode_) + { + if (!capnp_type.isEnum()) + throwCannotConvert(data_type, column_name, capnp_type); + + const auto * enum_type = assert_cast<const DataTypeEnum<EnumType> *>(data_type.get()); + const auto & enum_values = dynamic_cast<const EnumValues<EnumType> &>(*enum_type); + + enum_schema = capnp_type.asEnum(); + auto enumerants = enum_schema.getEnumerants(); + if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) + { + auto ch_enum_values = enum_values.getSetOfAllValues(); + std::unordered_set<UInt16> capn_enum_values; + for (auto enumerant : enumerants) + capn_enum_values.insert(enumerant.getOrdinal()); + + /// Check if ClickHouse values is a superset of CapnProto values. + ch_enum_is_superset = true; + /// In CapnProto Enum fields are numbered sequentially starting from zero. + /// Check if max CapnProto value exceeds max ClickHouse value. + constexpr auto max_value = std::is_same_v<EnumType, Int8> ? INT8_MAX : INT16_MAX; + if (enumerants.size() > max_value) + { + ch_enum_is_superset = false; + } + else + { + for (auto capnp_value : capn_enum_values) + { + if (!ch_enum_values.contains(static_cast<EnumType>(capnp_value))) + { + ch_enum_is_superset = false; + break; + } + } + } + + /// Check if CapnProto values is a superset of ClickHouse values. + capnp_enum_is_superset = true; + for (auto ch_value : ch_enum_values) + { + /// Capnp doesn't support negative enum values. + if (ch_value < 0 || !capn_enum_values.contains(static_cast<UInt16>(ch_value))) + { + capnp_enum_is_superset = false; + break; + } + } + } + else + { + bool to_lower = enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE; + + auto all_values = enum_values.getValues(); + std::unordered_map<String, EnumType> ch_name_to_value; + for (auto & [name, value] : all_values) + ch_name_to_value[to_lower ? 
boost::algorithm::to_lower_copy(name) : name] = value; + + std::unordered_map<String, UInt16> capnp_name_to_value; + for (auto enumerant : enumerants) + { + String capnp_name = enumerant.getProto().getName(); + capnp_name_to_value[to_lower ? boost::algorithm::to_lower_copy(capnp_name) : capnp_name] = enumerant.getOrdinal(); + } + + /// Check if ClickHouse names is a superset of CapnProto names. + ch_enum_is_superset = true; + for (auto & [capnp_name, capnp_value] : capnp_name_to_value) + { + auto it = ch_name_to_value.find(capnp_name); + if (it == ch_name_to_value.end()) + { + ch_enum_is_superset = false; + break; + } + capnp_to_ch_values[capnp_value] = it->second; + } + + /// Check if CapnProto names is a superset of ClickHouse names. + capnp_enum_is_superset = true; + + for (auto & [ch_name, ch_value] : ch_name_to_value) + { + auto it = capnp_name_to_value.find(ch_name); + if (it == capnp_name_to_value.end()) + { + capnp_enum_is_superset = false; + break; + } + ch_to_capnp_values[ch_value] = it->second; + } + } + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().setDataField<UInt16>(slot_offset, getValue(column, row_num)); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + parent_list_builder.getBuilderImpl().setDataElement<UInt16>(array_index, getValue(column, row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + insertValue(column, parent_struct_reader.getReaderImpl().getDataField<UInt16>(slot_offset)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + insertValue(column, 
parent_list_reader.getReaderImpl().getDataElement<UInt16>(array_index)); + } + + private: + UInt16 getValue(const ColumnPtr & column, size_t row_num) + { + if (!capnp_enum_is_superset) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Cannot convert ClickHouse enum to CapnProto enum: CapnProto enum values/names is not a superset of ClickHouse enum values/names"); + + EnumType enum_value = assert_cast<const ColumnVector<EnumType> &>(*column).getElement(row_num); + if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) + return static_cast<UInt16>(enum_value); + auto it = ch_to_capnp_values.find(enum_value); + if (it == ch_to_capnp_values.end()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected value {} in ClickHouse enum", enum_value); + + return it->second; + } + + void insertValue(IColumn & column, UInt16 capnp_enum_value) + { + if (!ch_enum_is_superset) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Cannot convert CapnProto enum to ClickHouse enum: ClickHouse enum values/names is not a superset of CapnProto enum values/names"); + + if (enum_comparing_mode == FormatSettings::CapnProtoEnumComparingMode::BY_VALUES) + { + assert_cast<ColumnVector<EnumType> &>(column).insertValue(static_cast<EnumType>(capnp_enum_value)); + } + else + { + auto it = capnp_to_ch_values.find(capnp_enum_value); + if (it == capnp_to_ch_values.end()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected value {} in CapnProto enum", capnp_enum_value); + + assert_cast<ColumnVector<EnumType> &>(column).insertValue(it->second); + } + } + + DataTypePtr data_type; + capnp::EnumSchema enum_schema; + const FormatSettings::CapnProtoEnumComparingMode enum_comparing_mode; + bool ch_enum_is_superset; + bool capnp_enum_is_superset; + std::unordered_map<EnumType, UInt16> ch_to_capnp_values; + std::unordered_map<UInt16, EnumType> capnp_to_ch_values; + }; + + class CapnProtoDateSerializer : public ICapnProtoSerializer + { + public: + 
CapnProtoDateSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isUInt16()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().setDataField<UInt16>(slot_offset, getValue(column, row_num)); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + parent_list_builder.getBuilderImpl().setDataElement<UInt16>(array_index, getValue(column, row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + insertValue(column, parent_struct_reader.getReaderImpl().getDataField<UInt16>(slot_offset)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + insertValue(column, parent_list_reader.getReaderImpl().getDataElement<UInt16>(array_index)); + } + + private: + UInt16 getValue(const ColumnPtr & column, size_t row_num) + { + return assert_cast<const ColumnDate &>(*column).getElement(row_num); + } + + void insertValue(IColumn & column, UInt16 value) + { + assert_cast<ColumnDate &>(column).insertValue(value); + } + }; + + class CapnProtoDate32Serializer : public ICapnProtoSerializer + { + public: + CapnProtoDate32Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isInt32()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + 
parent_struct_builder.getBuilderImpl().setDataField<Int32>(slot_offset, getValue(column, row_num)); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + parent_list_builder.getBuilderImpl().setDataElement<Int32>(array_index, getValue(column, row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + insertValue(column, parent_struct_reader.getReaderImpl().getDataField<Int32>(slot_offset)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + insertValue(column, parent_list_reader.getReaderImpl().getDataElement<Int32>(array_index)); + } + + private: + Int32 getValue(const ColumnPtr & column, size_t row_num) + { + return assert_cast<const ColumnDate32 &>(*column).getElement(row_num); + } + + void insertValue(IColumn & column, Int32 value) + { + assert_cast<ColumnDate32 &>(column).insertValue(value); + } + }; + + class CapnProtoDateTimeSerializer : public ICapnProtoSerializer + { + public: + CapnProtoDateTimeSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isUInt32()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().setDataField<UInt32>(slot_offset, getValue(column, row_num)); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + parent_list_builder.getBuilderImpl().setDataElement<UInt32>(array_index, getValue(column, row_num)); + } + + void 
readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + insertValue(column, parent_struct_reader.getReaderImpl().getDataField<UInt32>(slot_offset)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + insertValue(column, parent_list_reader.getReaderImpl().getDataElement<UInt32>(array_index)); + } + + private: + UInt32 getValue(const ColumnPtr & column, size_t row_num) + { + return assert_cast<const ColumnDateTime &>(*column).getElement(row_num); + } + + void insertValue(IColumn & column, UInt32 value) + { + assert_cast<ColumnDateTime &>(column).insertValue(value); + } + }; + + class CapnProtoDateTime64Serializer : public ICapnProtoSerializer + { + public: + CapnProtoDateTime64Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isInt64()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().setDataField<Int64>(slot_offset, getValue(column, row_num)); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + parent_list_builder.getBuilderImpl().setDataElement<Int64>(array_index, getValue(column, row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + insertValue(column, parent_struct_reader.getReaderImpl().getDataField<Int64>(slot_offset)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + insertValue(column, 
parent_list_reader.getReaderImpl().getDataElement<Int64>(array_index)); + } + + private: + Int64 getValue(const ColumnPtr & column, size_t row_num) + { + return assert_cast<const ColumnDateTime64 &>(*column).getElement(row_num); + } + + void insertValue(IColumn & column, Int64 value) + { + assert_cast<ColumnDateTime64 &>(column).insertValue(value); + } + }; + + template <typename DecimalType> + class CapnProtoDecimalSerializer : public ICapnProtoSerializer + { + public: + using NativeType = typename DecimalType::NativeType; + + CapnProtoDecimalSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + auto which = WhichDataType(data_type); + if ((!capnp_type.isInt32() && which.isDecimal32()) || (!capnp_type.isInt64() && which.isDecimal64())) + throwCannotConvert(data_type, column_name, capnp_type); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().setDataField<NativeType>(slot_offset, getValue(column, row_num)); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + parent_list_builder.getBuilderImpl().setDataElement<NativeType>(array_index, getValue(column, row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + insertValue(column, parent_struct_reader.getReaderImpl().getDataField<NativeType>(slot_offset)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + insertValue(column, parent_list_reader.getReaderImpl().getDataElement<NativeType>(array_index)); + } + + private: + NativeType getValue(const ColumnPtr & column, size_t row_num) + { + return 
assert_cast<const ColumnDecimal<DecimalType> &>(*column).getElement(row_num); + } + + void insertValue(IColumn & column, NativeType value) + { + assert_cast<ColumnDecimal<DecimalType> &>(column).insertValue(value); + } + }; + + + class CapnProtoIPv4Serializer : public ICapnProtoSerializer + { + public: + CapnProtoIPv4Serializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isUInt32()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().setDataField<UInt32>(slot_offset, getValue(column, row_num)); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + parent_list_builder.getBuilderImpl().setDataElement<UInt32>(array_index, getValue(column, row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + insertValue(column, parent_struct_reader.getReaderImpl().getDataField<UInt32>(slot_offset)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + insertValue(column, parent_list_reader.getReaderImpl().getDataElement<UInt32>(array_index)); + } + + private: + UInt32 getValue(const ColumnPtr & column, size_t row_num) + { + return assert_cast<const ColumnIPv4 &>(*column).getElement(row_num); + } + + void insertValue(IColumn & column, UInt32 value) + { + assert_cast<ColumnIPv4 &>(column).insertValue(IPv4(value)); + } + }; + + template <typename T> + class CapnProtoFixedSizeRawDataSerializer : public ICapnProtoSerializer + { + private: + static constexpr size_t expected_value_size = sizeof(T); + + 
public: + CapnProtoFixedSizeRawDataSerializer(const DataTypePtr & data_type_, const String & column_name, const capnp::Type & capnp_type) : data_type(data_type_) + { + if (!capnp_type.isData()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().getPointerField(slot_offset).setBlob<capnp::Data>(getData(column, row_num)); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicList::Builder & parent_struct_builder, UInt32 array_index, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().getPointerElement(array_index).setBlob<capnp::Data>(getData(column, row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + insertData(column, parent_struct_reader.getReaderImpl().getPointerField(slot_offset).getBlob<capnp::Data>(nullptr, 0)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + insertData(column, parent_list_reader.getReaderImpl().getPointerElement(array_index).getBlob<capnp::Data>(nullptr, 0)); + } + + private: + capnp::Data::Reader getData(const ColumnPtr & column, size_t row_num) + { + auto data = column->getDataAt(row_num); + return capnp::Data::Reader(reinterpret_cast<const kj::byte *>(data.data), data.size); + } + + void insertData(IColumn & column, capnp::Data::Reader data) + { + if (data.size() != expected_value_size) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", data_type->getName(), data.size()); + + column.insertData(reinterpret_cast<const char *>(data.begin()), data.size()); + } + + DataTypePtr data_type; + }; + + template <typename CapnpType> + class CapnProtoStringSerializer 
: public ICapnProtoSerializer + { + public: + CapnProtoStringSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type) + { + if (!capnp_type.isData() && !capnp_type.isText()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().getPointerField(slot_offset).setBlob<CapnpType>(getData(column, row_num)); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicList::Builder & parent_struct_builder, UInt32 array_index, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().getPointerElement(array_index).setBlob<CapnpType>(getData(column, row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + insertData(column, parent_struct_reader.getReaderImpl().getPointerField(slot_offset).getBlob<CapnpType>(nullptr, 0)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + insertData(column, parent_list_reader.getReaderImpl().getPointerElement(array_index).getBlob<CapnpType>(nullptr, 0)); + } + + private: + using Reader = typename CapnpType::Reader; + + Reader getData(const ColumnPtr & column, size_t row_num) + { + auto data = column->getDataAt(row_num); + if constexpr (std::is_same_v<CapnpType, capnp::Data>) + return Reader(reinterpret_cast<const kj::byte *>(data.data), data.size); + else + return Reader(data.data, data.size); + } + + void insertData(IColumn & column, Reader data) + { + column.insertData(reinterpret_cast<const char *>(data.begin()), data.size()); + } + }; + + template <typename CapnpType> + class CapnProtoFixedStringSerializer : public ICapnProtoSerializer + { + private: + + 
public: + CapnProtoFixedStringSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type_) : capnp_type(capnp_type_) + { + if (!capnp_type.isData() && !capnp_type.isText()) + throwCannotConvert(data_type, column_name, capnp_type); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().getPointerField(slot_offset).setBlob<CapnpType>(getData(column, row_num)); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> &, capnp::DynamicList::Builder & parent_struct_builder, UInt32 array_index, size_t row_num) override + { + parent_struct_builder.getBuilderImpl().getPointerElement(array_index).setBlob<CapnpType>(getData(column, row_num)); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + insertData(column, parent_struct_reader.getReaderImpl().getPointerField(slot_offset).getBlob<CapnpType>(nullptr, 0)); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + insertData(column, parent_list_reader.getReaderImpl().getPointerElement(array_index).getBlob<CapnpType>(nullptr, 0)); + } + + private: + using Reader = typename CapnpType::Reader; + + Reader getData(const ColumnPtr & column, size_t row_num) + { + auto data = column->getDataAt(row_num); + if constexpr (std::is_same_v<CapnpType, capnp::Data>) + { + return Reader(reinterpret_cast<const kj::byte *>(data.data), data.size); + } + else + { + if (data.data[data.size - 1] == 0) + return Reader(data.data, data.size); + + /// In TEXT type data should be null-terminated, but ClickHouse FixedString data could not be. + /// To make data null-terminated we should copy it to temporary String object and use it in capnp::Text::Reader. 
+ /// Note that capnp::Text::Reader works only with pointer to the data and it's size, so we should + /// guarantee that new String object life time is longer than capnp::Text::Reader life time. + tmp_string = data.toString(); + return Reader(tmp_string.data(), tmp_string.size()); + } + } + + void insertData(IColumn & column, Reader data) + { + auto & fixed_string_column = assert_cast<ColumnFixedString &>(column); + if (data.size() > fixed_string_column.getN()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot read data with size {} to FixedString with size {}", data.size(), fixed_string_column.getN()); + + fixed_string_column.insertData(reinterpret_cast<const char *>(data.begin()), data.size()); + } + + String tmp_string; + capnp::Type capnp_type; + }; + + std::unique_ptr<ICapnProtoSerializer> createSerializer(const DataTypePtr & type, const String & name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings); + + class CapnProtoLowCardinalitySerializer : public ICapnProtoSerializer + { + public: + CapnProtoLowCardinalitySerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + nested_serializer = createSerializer(assert_cast<const DataTypeLowCardinality &>(*data_type).getDictionaryType(), column_name, capnp_type, settings); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> & field_builder, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + writeRowImpl(column, field_builder, parent_struct_builder, slot_offset, row_num); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> & field_builder, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + writeRowImpl(column, field_builder, parent_list_builder, array_index, row_num); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader 
& parent_struct_reader, UInt32 slot_offset) override + { + readRowImpl(column, parent_struct_reader, slot_offset); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + readRowImpl(column, parent_list_reader, array_index); + } + + private: + template <typename ParentBuilder> + void writeRowImpl(const ColumnPtr & column, std::unique_ptr<FieldBuilder> & field_builder, ParentBuilder & parent_builder, UInt32 offset_or_index, size_t row_num) + { + const auto & low_cardinality_column = assert_cast<const ColumnLowCardinality &>(*column); + size_t index = low_cardinality_column.getIndexAt(row_num); + const auto & dict_column = low_cardinality_column.getDictionary().getNestedColumn(); + nested_serializer->writeRow(dict_column, field_builder, parent_builder, offset_or_index, index); + } + + template <typename ParentReader> + void readRowImpl(IColumn & column, const ParentReader & parent_reader, UInt32 offset_or_index) + { + auto & low_cardinality_column = assert_cast<ColumnLowCardinality &>(column); + auto tmp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); + nested_serializer->readRow(*tmp_column, parent_reader, offset_or_index); + low_cardinality_column.insertFromFullColumn(*tmp_column, 0); + } + + std::unique_ptr<ICapnProtoSerializer> nested_serializer; + }; + + class CapnProtoNullableSerializer : public ICapnProtoSerializer + { + public: + CapnProtoNullableSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + if (!capnp_type.isStruct()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type, got CapnProto type {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + /// Check that struct is a named union of type VOID and one arbitrary type. 
+ struct_schema = capnp_type.asStruct(); + auto node = struct_schema.getProto().getStruct(); + struct_size = capnp::_::StructSize(node.getDataWordCount(), node.getPointerCount()); + discriminant_offset = node.getDiscriminantOffset(); + if (!checkIfStructIsNamedUnion(struct_schema)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type." + "Given CapnProto struct is not a named union: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + auto union_fields = struct_schema.getUnionFields(); + if (union_fields.size() != 2) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type." + "Given CapnProto union have more than 2 fields: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + auto first = union_fields[0]; + auto second = union_fields[1]; + auto nested_type = assert_cast<const DataTypeNullable *>(data_type.get())->getNestedType(); + nested_slot_offset = first.getProto().getSlot().getOffset(); /// Both fields have the same offset. + if (first.getType().isVoid()) + { + nested_serializer = createSerializer(nested_type, column_name, second.getType(), settings); + null_discriminant = 0; + nested_discriminant = 1; + } + else if (second.getType().isVoid()) + { + nested_serializer = createSerializer(nested_type, column_name, first.getType(), settings); + null_discriminant = 1; + nested_discriminant = 0; + } + else + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert column \"{}\": Nullable can be represented only as a named union of type Void and nested type." 
+ "Given CapnProto union doesn't have field with type Void: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> & field_builder, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + writeRowImpl(column, field_builder, parent_struct_builder, slot_offset, row_num); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> & field_builder, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + writeRowImpl(column, field_builder, parent_list_builder, array_index, row_num); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, parent_struct_reader.getReaderImpl().getPointerField(slot_offset).getStruct(nullptr)); + readRowImpl(column, struct_reader); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, parent_list_reader.getReaderImpl().getStructElement(array_index)); + readRowImpl(column, struct_reader); + } + + private: + template <typename ParentBuilder> + void writeRowImpl(const ColumnPtr & column, std::unique_ptr<FieldBuilder> & field_builder, ParentBuilder & parent_builder, UInt32 offset_or_index, size_t row_num) + { + if (!field_builder) + field_builder = initStructBuilder(parent_builder, offset_or_index, struct_size, 1, struct_schema); + + auto & struct_builder = assert_cast<StructBuilder &>(*field_builder); + + const auto & nullable_column = assert_cast<const ColumnNullable &>(*column); + if (nullable_column.isNullAt(row_num)) + { + auto struct_builder_impl = struct_builder.impl.getBuilderImpl(); + struct_builder_impl.setDataField<uint16_t>(discriminant_offset, 
null_discriminant); + struct_builder_impl.setDataField<capnp::Void>(nested_slot_offset, capnp::Void()); + } + else + { + const auto & nested_column = nullable_column.getNestedColumnPtr(); + struct_builder.impl.getBuilderImpl().setDataField<uint16_t>(discriminant_offset, nested_discriminant); + nested_serializer->writeRow(nested_column, struct_builder.field_builders[0], struct_builder.impl, nested_slot_offset, row_num); + } + } + + void readRowImpl(IColumn & column, capnp::DynamicStruct::Reader & struct_reader) + { + auto & nullable_column = assert_cast<ColumnNullable &>(column); + auto discriminant = struct_reader.getReaderImpl().getDataField<uint16_t>(discriminant_offset); + + if (discriminant == null_discriminant) + nullable_column.insertDefault(); + else + { + auto & nested_column = nullable_column.getNestedColumn(); + nested_serializer->readRow(nested_column, struct_reader, nested_slot_offset); + nullable_column.getNullMapData().push_back(0); + } + } + + + std::unique_ptr<ICapnProtoSerializer> nested_serializer; + capnp::StructSchema struct_schema; + capnp::_::StructSize struct_size; + UInt32 discriminant_offset; + UInt16 null_discriminant; + UInt16 nested_discriminant; + UInt32 nested_slot_offset; + }; + + class CapnProtoArraySerializer : public ICapnProtoSerializer + { + public: + CapnProtoArraySerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + if (!capnp_type.isList()) + throwCannotConvert(data_type, column_name, capnp_type); + + auto nested_type = assert_cast<const DataTypeArray *>(data_type.get())->getNestedType(); + list_schema = capnp_type.asList(); + auto element_type = list_schema.getElementType(); + element_size = capnp::elementSizeFor(element_type.which()); + if (element_type.isStruct()) + { + element_is_struct = true; + auto node = element_type.asStruct().getProto().getStruct(); + element_struct_size = capnp::_::StructSize(node.getDataWordCount(), 
node.getPointerCount()); + } + + nested_serializer = createSerializer(nested_type, column_name, capnp_type.asList().getElementType(), settings); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> & field_builder, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + writeRowImpl(column, field_builder, parent_struct_builder, slot_offset, row_num); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> & field_builder, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + writeRowImpl(column, field_builder, parent_list_builder, array_index, row_num); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + auto list_reader = capnp::DynamicList::Reader(list_schema, parent_struct_reader.getReaderImpl().getPointerField(slot_offset).getList(element_size, nullptr)); + readRowImpl(column, list_reader); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + auto list_reader = capnp::DynamicList::Reader(list_schema, parent_list_reader.getReaderImpl().getPointerElement(array_index).getList(element_size, nullptr)); + readRowImpl(column, list_reader); + } + + private: + template <typename ParentBuilder> + void writeRowImpl(const ColumnPtr & column, std::unique_ptr<FieldBuilder> & field_builder, ParentBuilder & parent_builder, UInt32 offset_or_index, size_t row_num) + { + const auto * array_column = assert_cast<const ColumnArray *>(column.get()); + const auto & nested_column = array_column->getDataPtr(); + const auto & offsets = array_column->getOffsets(); + auto offset = offsets[row_num - 1]; + UInt32 size = static_cast<UInt32>(offsets[row_num] - offset); + + if (!field_builder) + field_builder = std::make_unique<ListBuilder>(capnp::DynamicList::Builder(list_schema, 
initListBuilder(parent_builder, offset_or_index, size)), size); + + auto & list_builder = assert_cast<ListBuilder &>(*field_builder); + for (UInt32 i = 0; i != size; ++i) + nested_serializer->writeRow(nested_column, list_builder.nested_builders[i], list_builder.impl, i, offset + i); + } + + template <typename ParentBuilder> + capnp::_::ListBuilder initListBuilder(ParentBuilder & parent_builder, UInt32 offset_or_index, UInt32 size) + { + if (element_is_struct) + { + if constexpr (std::is_same_v<ParentBuilder, capnp::DynamicStruct::Builder>) + return parent_builder.getBuilderImpl().getPointerField(offset_or_index).initStructList(size, element_struct_size); + else + return parent_builder.getBuilderImpl().getPointerElement(offset_or_index).initStructList(size, element_struct_size); + } + + if constexpr (std::is_same_v<ParentBuilder, capnp::DynamicStruct::Builder>) + return parent_builder.getBuilderImpl().getPointerField(offset_or_index).initList(element_size, size); + else + return parent_builder.getBuilderImpl().getPointerElement(offset_or_index).initList(element_size, size); + } + + void readRowImpl(IColumn & column, const capnp::DynamicList::Reader & list_reader) + { + UInt32 size = list_reader.size(); + auto & column_array = assert_cast<ColumnArray &>(column); + auto & offsets = column_array.getOffsets(); + offsets.push_back(offsets.back() + list_reader.size()); + + auto & nested_column = column_array.getData(); + for (UInt32 i = 0; i != size; ++i) + nested_serializer->readRow(nested_column, list_reader, i); + } + + capnp::ListSchema list_schema; + std::unique_ptr<ICapnProtoSerializer> nested_serializer; + capnp::ElementSize element_size; + capnp::_::StructSize element_struct_size; + bool element_is_struct = false; + + }; + + class CapnProtoMapSerializer : public ICapnProtoSerializer + { + public: + CapnProtoMapSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + /// 
We output/input Map type as follow CapnProto schema + /// + /// struct Map { + /// struct Entry { + /// key @0: Key; + /// value @1: Value; + /// } + /// entries @0 :List(Entry); + /// } + + if (!capnp_type.isStruct()) + throwCannotConvert(data_type, column_name, capnp_type); + + struct_schema = capnp_type.asStruct(); + auto node = struct_schema.getProto().getStruct(); + struct_size = capnp::_::StructSize(node.getDataWordCount(), node.getPointerCount()); + + if (checkIfStructContainsUnnamedUnion(struct_schema)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto Struct with unnamed union {}", + column_name, + data_type->getName(), + getCapnProtoFullTypeName(capnp_type)); + + if (struct_schema.getFields().size() != 1) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": Map type can be represented as a Struct with one list field, got struct: {}", + column_name, + getCapnProtoFullTypeName(capnp_type)); + + const auto & field_type = struct_schema.getFields()[0].getType(); + if (!field_type.isList()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": Map type can be represented as a Struct with one list field, got field: {}", + column_name, + getCapnProtoFullTypeName(field_type)); + + auto list_element_type = field_type.asList().getElementType(); + if (!list_element_type.isStruct()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": Field of struct that represents Map should be a list of structs, got list of {}", + column_name, + getCapnProtoFullTypeName(list_element_type)); + + auto key_value_struct = list_element_type.asStruct(); + if (checkIfStructContainsUnnamedUnion(key_value_struct)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": struct that represents Map entries is unnamed union: {}", + column_name, + 
getCapnProtoFullTypeName(list_element_type)); + + if (key_value_struct.getFields().size() != 2) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\": struct that represents Map entries should contain only 2 fields, got struct {}", + column_name, + getCapnProtoFullTypeName(list_element_type)); + + const auto & map_type = assert_cast<const DataTypeMap &>(*data_type); + DataTypes types = {map_type.getKeyType(), map_type.getValueType()}; + Names names = {"key", "value"}; + auto entries_type = std::make_shared<DataTypeArray>(std::make_shared<DataTypeTuple>(types, names)); + nested_serializer = createSerializer(entries_type, column_name, field_type, settings); + entries_slot_offset = struct_schema.getFields()[0].getProto().getSlot().getOffset(); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> & field_builder, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + writeRowImpl(column, field_builder, parent_struct_builder, slot_offset, row_num); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> & field_builder, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + writeRowImpl(column, field_builder, parent_list_builder, array_index, row_num); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, parent_struct_reader.getReaderImpl().getPointerField(slot_offset).getStruct(nullptr)); + readRowImpl(column, struct_reader); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, parent_list_reader.getReaderImpl().getStructElement(array_index)); + readRowImpl(column, struct_reader); + } + + private: + 
template <typename ParentBuilder> + void writeRowImpl(const ColumnPtr & column, std::unique_ptr<FieldBuilder> & field_builder, ParentBuilder & parent_builder, UInt32 offset_or_index, size_t row_num) + { + if (!field_builder) + field_builder = initStructBuilder(parent_builder, offset_or_index, struct_size, 1, struct_schema); + + auto & struct_builder = assert_cast<StructBuilder &>(*field_builder); + const auto & entries_column = assert_cast<const ColumnMap *>(column.get())->getNestedColumnPtr(); + nested_serializer->writeRow(entries_column, struct_builder.field_builders[0], struct_builder.impl, entries_slot_offset, row_num); + } + + void readRowImpl(IColumn & column, const capnp::DynamicStruct::Reader & struct_reader) + { + auto & entries_column = assert_cast<ColumnMap &>(column).getNestedColumn(); + nested_serializer->readRow(entries_column, struct_reader, entries_slot_offset); + } + + std::unique_ptr<ICapnProtoSerializer> nested_serializer; + capnp::StructSchema struct_schema; + capnp::_::StructSize struct_size; + UInt32 entries_slot_offset; + }; + + class CapnProtoStructureSerializer : public ICapnProtoSerializer + { + public: + CapnProtoStructureSerializer(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) : struct_schema(schema) + { + if (checkIfStructIsNamedUnion(schema) || checkIfStructContainsUnnamedUnion(schema)) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Root CapnProto Struct cannot be named union/struct with unnamed union"); + + initialize(data_types, names, settings); + } + + CapnProtoStructureSerializer(const DataTypePtr & data_type, const String & column_name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + if (!capnp_type.isStruct()) + throwCannotConvert(data_type, column_name, capnp_type); + + struct_schema = capnp_type.asStruct(); + + if (checkIfStructIsNamedUnion(struct_schema) || 
checkIfStructContainsUnnamedUnion(struct_schema)) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto named union/struct with unnamed union {}", + column_name, + data_type->getName(), + getCapnProtoFullTypeName(capnp_type)); + + const auto * tuple_data_type = assert_cast<const DataTypeTuple *>(data_type.get()); + auto nested_types = tuple_data_type->getElements(); + Names nested_names; + bool have_explicit_names = tuple_data_type->haveExplicitNames(); + auto structure_fields = struct_schema.getFields(); + if (!have_explicit_names) + { + if (nested_types.size() != structure_fields.size()) + throw Exception( + ErrorCodes::CAPN_PROTO_BAD_CAST, + "Cannot convert ClickHouse column \"{}\" with type {} to CapnProto type {}: Tuple and Struct have different sizes {} != {}", + column_name, + data_type->getName(), + getCapnProtoFullTypeName(capnp_type), + nested_types.size(), + structure_fields.size()); + nested_names.reserve(structure_fields.size()); + for (auto field : structure_fields) + nested_names.push_back(field.getProto().getName()); + } + else + { + nested_names = tuple_data_type->getElementNames(); + } + + try + { + initialize(nested_types, nested_names, settings); + } + catch (Exception & e) + { + e.addMessage("(while converting column {})", column_name); + throw std::move(e); + } + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> & field_builder, capnp::DynamicStruct::Builder & parent_struct_builder, UInt32 slot_offset, size_t row_num) override + { + writeRowImpl(column, field_builder, parent_struct_builder, slot_offset, row_num); + } + + void writeRow(const ColumnPtr & column, std::unique_ptr<FieldBuilder> & field_builder, capnp::DynamicList::Builder & parent_list_builder, UInt32 array_index, size_t row_num) override + { + writeRowImpl(column, field_builder, parent_list_builder, array_index, row_num); + } + + /// Method for writing root struct. 
+ void writeRow(const Columns & columns, StructBuilder & struct_builder, size_t row_num) + { + for (size_t i = 0; i != columns.size(); ++i) + fields_serializers[i]->writeRow(columns[i], struct_builder.field_builders[fields_indexes[i]], struct_builder.impl, fields_offsets[i], row_num); + } + + void readRow(IColumn & column, const capnp::DynamicStruct::Reader & parent_struct_reader, UInt32 slot_offset) override + { + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, parent_struct_reader.getReaderImpl().getPointerField(slot_offset).getStruct(nullptr)); + readRowImpl(column, struct_reader); + } + + void readRow(IColumn & column, const capnp::DynamicList::Reader & parent_list_reader, UInt32 array_index) override + { + auto struct_reader = capnp::DynamicStruct::Reader(struct_schema, parent_list_reader.getReaderImpl().getStructElement(array_index)); + readRowImpl(column, struct_reader); + } + + /// Method for reading from root struct. + void readRow(MutableColumns & columns, const capnp::DynamicStruct::Reader & reader) + { + for (size_t i = 0; i != columns.size(); ++i) + fields_serializers[i]->readRow(*columns[i], reader, fields_offsets[i]); + } + + private: + void initialize(const DataTypes & data_types, const Names & names, const FormatSettings::CapnProto & settings) + { + auto node = struct_schema.getProto().getStruct(); + struct_size = capnp::_::StructSize(node.getDataWordCount(), node.getPointerCount()); + fields_count = struct_schema.getFields().size(); + fields_serializers.reserve(data_types.size()); + fields_offsets.reserve(data_types.size()); + fields_indexes.reserve(data_types.size()); + for (size_t i = 0; i != data_types.size(); ++i) + { + auto [field_name, _] = splitFieldName(names[i]); + auto field = findFieldByName(struct_schema, field_name); + if (!field) + throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto schema doesn't contain field with name {}", field_name); + + auto capnp_type = field->getType(); + 
fields_serializers.push_back(createSerializer(data_types[i], names[i], capnp_type, settings)); + fields_offsets.push_back(field->getProto().getSlot().getOffset()); + fields_indexes.push_back(field->getIndex()); + } + } + + template <typename ParentBuilder> + void writeRowImpl(const ColumnPtr & column, std::unique_ptr<FieldBuilder> & field_builder, ParentBuilder & parent_builder, UInt32 offset_or_index, size_t row_num) + { + if (!field_builder) + field_builder = initStructBuilder(parent_builder, offset_or_index, struct_size, fields_count, struct_schema); + + auto & struct_builder = assert_cast<StructBuilder &>(*field_builder); + if (const auto * tuple_column = typeid_cast<const ColumnTuple *>(column.get())) + { + const auto & columns = tuple_column->getColumns(); + for (size_t i = 0; i != columns.size(); ++i) + fields_serializers[i]->writeRow(columns[i], struct_builder.field_builders[fields_indexes[i]], struct_builder.impl, fields_offsets[i], row_num); + } + else + { + fields_serializers[0]->writeRow(column, struct_builder.field_builders[fields_indexes[0]], struct_builder.impl, fields_offsets[0], row_num); + } + } + + void readRowImpl(IColumn & column, const capnp::DynamicStruct::Reader & struct_reader) + { + if (auto * tuple_column = typeid_cast<ColumnTuple *>(&column)) + { + for (size_t i = 0; i != tuple_column->tupleSize(); ++i) + fields_serializers[i]->readRow(tuple_column->getColumn(i), struct_reader, fields_offsets[i]); + } + else + fields_serializers[0]->readRow(column, struct_reader, fields_offsets[0]); + } + + capnp::StructSchema struct_schema; + capnp::_::StructSize struct_size; + size_t fields_count; + std::vector<std::unique_ptr<ICapnProtoSerializer>> fields_serializers; + std::vector<UInt32> fields_offsets; + std::vector<size_t> fields_indexes; + + }; + + std::unique_ptr<ICapnProtoSerializer> createSerializer(const DataTypePtr & type, const String & name, const capnp::Type & capnp_type, const FormatSettings::CapnProto & settings) + { + auto [field_name, 
nested_name] = splitFieldName(name); + if (!nested_name.empty() && !capnp_type.isList()) + { + if (!capnp_type.isStruct()) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name); + + return std::make_unique<CapnProtoStructureSerializer>(DataTypes{type}, Names{nested_name}, capnp_type.asStruct(), settings); + } + + switch (type->getTypeId()) + { + case TypeIndex::Int8: + return createIntegerSerializer<Int8>(type, name, capnp_type); + case TypeIndex::UInt8: + if (isBool(type)) + return createIntegerSerializer<UInt8, true>(type, name, capnp_type); + return createIntegerSerializer<UInt8>(type, name, capnp_type); + case TypeIndex::Int16: + return createIntegerSerializer<Int16>(type, name, capnp_type); + case TypeIndex::UInt16: + return createIntegerSerializer<UInt16>(type, name, capnp_type); + case TypeIndex::Int32: + return createIntegerSerializer<Int32>(type, name, capnp_type); + case TypeIndex::UInt32: + return createIntegerSerializer<UInt32>(type, name, capnp_type); + case TypeIndex::Int64: + return createIntegerSerializer<Int64>(type, name, capnp_type); + case TypeIndex::UInt64: + return createIntegerSerializer<UInt64>(type, name, capnp_type); + case TypeIndex::Int128: + return std::make_unique<CapnProtoFixedSizeRawDataSerializer<Int128>>(type, name, capnp_type); + case TypeIndex::UInt128: + return std::make_unique<CapnProtoFixedSizeRawDataSerializer<UInt128>>(type, name, capnp_type); + case TypeIndex::Int256: + return std::make_unique<CapnProtoFixedSizeRawDataSerializer<Int256>>(type, name, capnp_type); + case TypeIndex::UInt256: + return std::make_unique<CapnProtoFixedSizeRawDataSerializer<UInt256>>(type, name, capnp_type); + case TypeIndex::Float32: + return createFloatSerializer<Float32>(type, name, capnp_type); + case TypeIndex::Float64: + return createFloatSerializer<Float64>(type, name, capnp_type); + case TypeIndex::Date: + return std::make_unique<CapnProtoDateSerializer>(type, name, capnp_type); + case TypeIndex::Date32: 
+ return std::make_unique<CapnProtoDate32Serializer>(type, name, capnp_type); + case TypeIndex::DateTime: + return std::make_unique<CapnProtoDateTimeSerializer>(type, name, capnp_type); + case TypeIndex::DateTime64: + return std::make_unique<CapnProtoDateTime64Serializer>(type, name, capnp_type); + case TypeIndex::Decimal32: + return std::make_unique<CapnProtoDecimalSerializer<Decimal32>>(type, name, capnp_type); + case TypeIndex::Decimal64: + return std::make_unique<CapnProtoDecimalSerializer<Decimal64>>(type, name, capnp_type); + case TypeIndex::Decimal128: + return std::make_unique<CapnProtoFixedSizeRawDataSerializer<Decimal128>>(type, name, capnp_type); + case TypeIndex::Decimal256: + return std::make_unique<CapnProtoFixedSizeRawDataSerializer<Decimal256>>(type, name, capnp_type); + case TypeIndex::IPv4: + return std::make_unique<CapnProtoIPv4Serializer>(type, name, capnp_type); + case TypeIndex::IPv6: + return std::make_unique<CapnProtoFixedSizeRawDataSerializer<IPv6>>(type, name, capnp_type); + case TypeIndex::UUID: + return std::make_unique<CapnProtoFixedSizeRawDataSerializer<UUID>>(type, name, capnp_type); + case TypeIndex::Enum8: + return std::make_unique<CapnProtoEnumSerializer<Int8>>(type, name, capnp_type, settings.enum_comparing_mode); + case TypeIndex::Enum16: + return std::make_unique<CapnProtoEnumSerializer<Int16>>(type, name, capnp_type, settings.enum_comparing_mode); + case TypeIndex::String: + if (capnp_type.isData()) + return std::make_unique<CapnProtoStringSerializer<capnp::Data>>(type, name, capnp_type); + return std::make_unique<CapnProtoStringSerializer<capnp::Text>>(type, name, capnp_type); + case TypeIndex::FixedString: + if (capnp_type.isData()) + return std::make_unique<CapnProtoFixedStringSerializer<capnp::Data>>(type, name, capnp_type); + return std::make_unique<CapnProtoFixedStringSerializer<capnp::Text>>(type, name, capnp_type); + case TypeIndex::LowCardinality: + return std::make_unique<CapnProtoLowCardinalitySerializer>(type, name, 
capnp_type, settings); + case TypeIndex::Nullable: + return std::make_unique<CapnProtoNullableSerializer>(type, name, capnp_type, settings); + case TypeIndex::Array: + return std::make_unique<CapnProtoArraySerializer>(type, name, capnp_type, settings); + case TypeIndex::Map: + return std::make_unique<CapnProtoMapSerializer>(type, name, capnp_type, settings); + case TypeIndex::Tuple: + return std::make_unique<CapnProtoStructureSerializer>(type, name, capnp_type, settings); + default: + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Type {} is not supported in CapnProto format", type->getName()); + } + } +} + +class CapnProtoSerializer::Impl +{ +public: + Impl(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + : struct_serializer(std::make_unique<CapnProtoStructureSerializer>(data_types, names, schema, settings)) + , fields_size(schema.getFields().size()) + { + } + + void writeRow(const Columns & columns, capnp::DynamicStruct::Builder builder, size_t row_num) + { + StructBuilder struct_builder(std::move(builder), fields_size); + struct_serializer->writeRow(columns, struct_builder, row_num); + } + + void readRow(MutableColumns & columns, capnp::DynamicStruct::Reader & reader) + { + struct_serializer->readRow(columns, reader); + } + +private: + std::unique_ptr<CapnProtoStructureSerializer> struct_serializer; + size_t fields_size; +}; + +CapnProtoSerializer::CapnProtoSerializer(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings) + : serializer_impl(std::make_unique<Impl>(data_types, names, schema, settings)) +{ +} + +void CapnProtoSerializer::writeRow(const Columns & columns, capnp::DynamicStruct::Builder builder, size_t row_num) +{ + serializer_impl->writeRow(columns, std::move(builder), row_num); +} + +void CapnProtoSerializer::readRow(MutableColumns & columns, capnp::DynamicStruct::Reader & reader) +{ + 
serializer_impl->readRow(columns, reader); +} + +CapnProtoSerializer::~CapnProtoSerializer() = default; + +} + +#endif diff --git a/contrib/clickhouse/src/Formats/CapnProtoSerializer.h b/contrib/clickhouse/src/Formats/CapnProtoSerializer.h new file mode 100644 index 0000000000..5af9be0526 --- /dev/null +++ b/contrib/clickhouse/src/Formats/CapnProtoSerializer.h @@ -0,0 +1,30 @@ +#pragma once + +#if USE_CAPNP + +#include <Core/Block.h> +#error #include <capnp/dynamic.h> +#include <Formats/FormatSettings.h> + +namespace DB +{ + +class CapnProtoSerializer +{ +public: + CapnProtoSerializer(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings); + + void writeRow(const Columns & columns, capnp::DynamicStruct::Builder builder, size_t row_num); + + void readRow(MutableColumns & columns, capnp::DynamicStruct::Reader & reader); + + ~CapnProtoSerializer(); + +private: + class Impl; + std::unique_ptr<Impl> serializer_impl; +}; + +} + +#endif diff --git a/contrib/clickhouse/src/Formats/ColumnMapping.cpp b/contrib/clickhouse/src/Formats/ColumnMapping.cpp new file mode 100644 index 0000000000..e33dfc878f --- /dev/null +++ b/contrib/clickhouse/src/Formats/ColumnMapping.cpp @@ -0,0 +1,71 @@ +#include <Formats/ColumnMapping.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; +} + +void ColumnMapping::setupByHeader(const Block & header) +{ + column_indexes_for_input_fields.resize(header.columns()); + names_of_columns = header.getNames(); + + for (size_t i = 0; i < column_indexes_for_input_fields.size(); ++i) + column_indexes_for_input_fields[i] = i; +} + +void ColumnMapping::addColumns( + const Names & column_names, const Block::NameMap & column_indexes_by_names, const FormatSettings & settings) +{ + std::vector<bool> read_columns(column_indexes_by_names.size(), false); + + for (const auto & name : column_names) + { + names_of_columns.push_back(name); + + const auto column_it = 
column_indexes_by_names.find(name); + if (column_it == column_indexes_by_names.end()) + { + if (settings.skip_unknown_fields) + { + column_indexes_for_input_fields.push_back(std::nullopt); + continue; + } + + throw Exception( + ErrorCodes::INCORRECT_DATA, + "Unknown field found in format header: " + "'{}' at position {}\nSet the 'input_format_skip_unknown_fields' parameter explicitly " + "to ignore and proceed", + name, column_indexes_for_input_fields.size()); + } + + const auto column_index = column_it->second; + + if (read_columns[column_index]) + throw Exception(ErrorCodes::INCORRECT_DATA, "Duplicate field found while parsing format header: {}", name); + + read_columns[column_index] = true; + column_indexes_for_input_fields.emplace_back(column_index); + } + + for (size_t i = 0; i != read_columns.size(); ++i) + { + if (!read_columns[i]) + not_presented_columns.push_back(i); + } +} + +void ColumnMapping::insertDefaultsForNotSeenColumns(MutableColumns & columns, std::vector<UInt8> & read_columns) +{ + for (auto index : not_presented_columns) + { + columns[index]->insertDefault(); + read_columns[index] = false; + } +} + +} diff --git a/contrib/clickhouse/src/Formats/ColumnMapping.h b/contrib/clickhouse/src/Formats/ColumnMapping.h new file mode 100644 index 0000000000..c20e598580 --- /dev/null +++ b/contrib/clickhouse/src/Formats/ColumnMapping.h @@ -0,0 +1,36 @@ +#pragma once + +#include <Core/Block.h> +#include <Formats/FormatSettings.h> + +namespace DB +{ + +/// Used for input text formats with headers/structure to map columns from input +/// and columns in header by names. +/// It's also used to pass info from header between different InputFormats in ParallelParsing +struct ColumnMapping +{ + /// Special flag for ParallelParsing. Non-atomic because there is strict + /// `happens-before` between read and write access. 
See InputFormatParallelParsing + bool is_set{false}; + + /// Maps indexes of columns in the input file to indexes of table columns + using OptionalIndexes = std::vector<std::optional<size_t>>; + OptionalIndexes column_indexes_for_input_fields; + + /// The list of column indexes that are not presented in input data. + std::vector<size_t> not_presented_columns; + + /// The list of column names in input data. Needed for better exception messages. + std::vector<String> names_of_columns; + + void setupByHeader(const Block & header); + + void addColumns( + const Names & column_names, const Block::NameMap & column_indexes_by_names, const FormatSettings & settings); + + void insertDefaultsForNotSeenColumns(MutableColumns & columns, std::vector<UInt8> & read_columns); +}; + +} diff --git a/contrib/clickhouse/src/Formats/EscapingRuleUtils.cpp b/contrib/clickhouse/src/Formats/EscapingRuleUtils.cpp new file mode 100644 index 0000000000..9f744218da --- /dev/null +++ b/contrib/clickhouse/src/Formats/EscapingRuleUtils.cpp @@ -0,0 +1,482 @@ +#include <Formats/EscapingRuleUtils.h> +#include <Formats/SchemaInferenceUtils.h> +#include <DataTypes/Serializations/SerializationNullable.h> +#include <DataTypes/DataTypeString.h> +#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/DataTypeNothing.h> +#include <DataTypes/DataTypeLowCardinality.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> +#include <IO/ReadBufferFromString.h> +#include <IO/parseDateTimeBestEffort.h> +#include <Parsers/TokenIterator.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +FormatSettings::EscapingRule stringToEscapingRule(const String & escaping_rule) +{ + if (escaping_rule.empty()) + return FormatSettings::EscapingRule::None; + else if (escaping_rule == "None") + return FormatSettings::EscapingRule::None; + else if (escaping_rule == "Escaped") + return FormatSettings::EscapingRule::Escaped; + else if 
(escaping_rule == "Quoted") + return FormatSettings::EscapingRule::Quoted; + else if (escaping_rule == "CSV") + return FormatSettings::EscapingRule::CSV; + else if (escaping_rule == "JSON") + return FormatSettings::EscapingRule::JSON; + else if (escaping_rule == "XML") + return FormatSettings::EscapingRule::XML; + else if (escaping_rule == "Raw") + return FormatSettings::EscapingRule::Raw; + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown escaping rule \"{}\"", escaping_rule); +} + +String escapingRuleToString(FormatSettings::EscapingRule escaping_rule) +{ + switch (escaping_rule) + { + case FormatSettings::EscapingRule::None: + return "None"; + case FormatSettings::EscapingRule::Escaped: + return "Escaped"; + case FormatSettings::EscapingRule::Quoted: + return "Quoted"; + case FormatSettings::EscapingRule::CSV: + return "CSV"; + case FormatSettings::EscapingRule::JSON: + return "JSON"; + case FormatSettings::EscapingRule::XML: + return "XML"; + case FormatSettings::EscapingRule::Raw: + return "Raw"; + } + UNREACHABLE(); +} + +void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +{ + NullOutput out; + constexpr const char * field_name = "<SKIPPED COLUMN>"; + constexpr size_t field_name_len = 16; + switch (escaping_rule) + { + case FormatSettings::EscapingRule::None: + /// Empty field, just skip spaces + break; + case FormatSettings::EscapingRule::Escaped: + readEscapedStringInto(out, buf); + break; + case FormatSettings::EscapingRule::Quoted: + readQuotedFieldInto(out, buf); + break; + case FormatSettings::EscapingRule::CSV: + readCSVStringInto(out, buf, format_settings.csv); + break; + case FormatSettings::EscapingRule::JSON: + skipJSONField(buf, StringRef(field_name, field_name_len)); + break; + case FormatSettings::EscapingRule::Raw: + readStringInto(out, buf); + break; + default: + UNREACHABLE(); + } +} + +bool deserializeFieldByEscapingRule( + const DataTypePtr & type, + 
const SerializationPtr & serialization, + IColumn & column, + ReadBuffer & buf, + FormatSettings::EscapingRule escaping_rule, + const FormatSettings & format_settings) +{ + bool read = true; + bool parse_as_nullable = format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type); + switch (escaping_rule) + { + case FormatSettings::EscapingRule::Escaped: + if (parse_as_nullable) + read = SerializationNullable::deserializeTextEscapedImpl(column, buf, format_settings, serialization); + else + serialization->deserializeTextEscaped(column, buf, format_settings); + break; + case FormatSettings::EscapingRule::Quoted: + if (parse_as_nullable) + read = SerializationNullable::deserializeTextQuotedImpl(column, buf, format_settings, serialization); + else + serialization->deserializeTextQuoted(column, buf, format_settings); + break; + case FormatSettings::EscapingRule::CSV: + if (parse_as_nullable) + read = SerializationNullable::deserializeTextCSVImpl(column, buf, format_settings, serialization); + else + serialization->deserializeTextCSV(column, buf, format_settings); + break; + case FormatSettings::EscapingRule::JSON: + if (parse_as_nullable) + read = SerializationNullable::deserializeTextJSONImpl(column, buf, format_settings, serialization); + else + serialization->deserializeTextJSON(column, buf, format_settings); + break; + case FormatSettings::EscapingRule::Raw: + if (parse_as_nullable) + read = SerializationNullable::deserializeTextRawImpl(column, buf, format_settings, serialization); + else + serialization->deserializeTextRaw(column, buf, format_settings); + break; + default: + throw Exception( + ErrorCodes::BAD_ARGUMENTS, "Escaping rule {} is not suitable for deserialization", escapingRuleToString(escaping_rule)); + } + return read; +} + +void serializeFieldByEscapingRule( + const IColumn & column, + const ISerialization & serialization, + WriteBuffer & out, + size_t row_num, + FormatSettings::EscapingRule escaping_rule, + const FormatSettings & 
format_settings) +{ + switch (escaping_rule) + { + case FormatSettings::EscapingRule::Escaped: + serialization.serializeTextEscaped(column, row_num, out, format_settings); + break; + case FormatSettings::EscapingRule::Quoted: + serialization.serializeTextQuoted(column, row_num, out, format_settings); + break; + case FormatSettings::EscapingRule::CSV: + serialization.serializeTextCSV(column, row_num, out, format_settings); + break; + case FormatSettings::EscapingRule::JSON: + serialization.serializeTextJSON(column, row_num, out, format_settings); + break; + case FormatSettings::EscapingRule::XML: + serialization.serializeTextXML(column, row_num, out, format_settings); + break; + case FormatSettings::EscapingRule::Raw: + serialization.serializeTextRaw(column, row_num, out, format_settings); + break; + case FormatSettings::EscapingRule::None: + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot serialize field with None escaping rule"); + } +} + +void writeStringByEscapingRule( + const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +{ + switch (escaping_rule) + { + case FormatSettings::EscapingRule::Quoted: + writeQuotedString(value, out); + break; + case FormatSettings::EscapingRule::JSON: + writeJSONString(value, out, format_settings); + break; + case FormatSettings::EscapingRule::Raw: + writeString(value, out); + break; + case FormatSettings::EscapingRule::CSV: + writeCSVString(value, out); + break; + case FormatSettings::EscapingRule::Escaped: + writeEscapedString(value, out); + break; + case FormatSettings::EscapingRule::XML: + writeXMLStringForTextElement(value, out); + break; + case FormatSettings::EscapingRule::None: + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot serialize string with None escaping rule"); + } +} + +template <bool read_string> +String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +{ + String 
result; + switch (escaping_rule) + { + case FormatSettings::EscapingRule::Quoted: + if constexpr (read_string) + readQuotedString(result, buf); + else + readQuotedField(result, buf); + break; + case FormatSettings::EscapingRule::JSON: + if constexpr (read_string) + readJSONString(result, buf); + else + readJSONField(result, buf); + break; + case FormatSettings::EscapingRule::Raw: + readString(result, buf); + break; + case FormatSettings::EscapingRule::CSV: + if constexpr (read_string) + readCSVString(result, buf, format_settings.csv); + else + readCSVField(result, buf, format_settings.csv); + break; + case FormatSettings::EscapingRule::Escaped: + if constexpr (read_string) + readEscapedString(result, buf); + else + readTSVField(result, buf); + break; + default: + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read value with {} escaping rule", escapingRuleToString(escaping_rule)); + } + return result; +} + +String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +{ + return readByEscapingRule<false>(buf, escaping_rule, format_settings); +} + +String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +{ + return readByEscapingRule<true>(buf, escaping_rule, format_settings); +} + +String readStringOrFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +{ + /// For Quoted escaping rule we can read value as string only if it starts with `'`. + /// If there is no `'` it can be any other field number/array/etc. + if (escaping_rule == FormatSettings::EscapingRule::Quoted && !buf.eof() && *buf.position() != '\'') + return readFieldByEscapingRule(buf, escaping_rule, format_settings); + + /// For JSON it's the same as for Quoted, but we check `"`. 
+ if (escaping_rule == FormatSettings::EscapingRule::JSON && !buf.eof() && *buf.position() != '"') + return readFieldByEscapingRule(buf, escaping_rule, format_settings); + + /// For other escaping rules we can read any field as string value. + return readStringByEscapingRule(buf, escaping_rule, format_settings); +} + +DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info) +{ + switch (escaping_rule) + { + case FormatSettings::EscapingRule::Quoted: + return tryInferDataTypeForSingleField(field, format_settings); + case FormatSettings::EscapingRule::JSON: + return tryInferDataTypeForSingleJSONField(field, format_settings, json_info); + case FormatSettings::EscapingRule::CSV: + { + if (!format_settings.csv.use_best_effort_in_schema_inference) + return std::make_shared<DataTypeString>(); + + if (field.empty()) + return nullptr; + + if (field == format_settings.csv.null_representation) + return makeNullable(std::make_shared<DataTypeNothing>()); + + if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation) + return DataTypeFactory::instance().get("Bool"); + + /// In CSV complex types are serialized in quotes. If we have quotes, we should try to infer type + /// from data inside quotes. + if (field.size() > 1 && ((field.front() == '\'' && field.back() == '\'') || (field.front() == '"' && field.back() == '"'))) + { + auto data = std::string_view(field.data() + 1, field.size() - 2); + /// First, try to infer dates and datetimes. + if (auto date_type = tryInferDateOrDateTimeFromString(data, format_settings)) + return date_type; + + /// Try to determine the type of value inside quotes + auto type = tryInferDataTypeForSingleField(data, format_settings); + + /// If we couldn't infer any type or it's a number or tuple in quotes, we determine it as a string. 
+ if (!type || isNumber(removeNullable(type)) || isTuple(type)) + return std::make_shared<DataTypeString>(); + + return type; + } + + /// Case when CSV value is not in quotes. Check if it's a number or date/datetime, and if not, determine it as a string. + if (auto number_type = tryInferNumberFromString(field, format_settings)) + return number_type; + + if (auto date_type = tryInferDateOrDateTimeFromString(field, format_settings)) + return date_type; + + return std::make_shared<DataTypeString>(); + } + case FormatSettings::EscapingRule::Raw: [[fallthrough]]; + case FormatSettings::EscapingRule::Escaped: + { + if (!format_settings.tsv.use_best_effort_in_schema_inference) + return std::make_shared<DataTypeString>(); + + if (field.empty()) + return nullptr; + + if (field == format_settings.tsv.null_representation) + return makeNullable(std::make_shared<DataTypeNothing>()); + + if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation) + return DataTypeFactory::instance().get("Bool"); + + if (auto date_type = tryInferDateOrDateTimeFromString(field, format_settings)) + return date_type; + + /// Special case when we have number that starts with 0. In TSV we don't parse such numbers, + /// see readIntTextUnsafe in ReadHelpers.h. If we see data started with 0, we can determine it + /// as a String, so parsing won't fail. 
+ if (field[0] == '0' && field.size() != 1) + return std::make_shared<DataTypeString>(); + + auto type = tryInferDataTypeForSingleField(field, format_settings); + if (!type) + return std::make_shared<DataTypeString>(); + return type; + } + default: + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the type for value with {} escaping rule", + escapingRuleToString(escaping_rule)); + } +} + +DataTypes tryInferDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info) +{ + DataTypes data_types; + data_types.reserve(fields.size()); + for (const auto & field : fields) + data_types.push_back(tryInferDataTypeByEscapingRule(field, format_settings, escaping_rule, json_info)); + return data_types; +} + +void transformInferredTypesByEscapingRuleIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info) +{ + switch (escaping_rule) + { + case FormatSettings::EscapingRule::JSON: + transformInferredJSONTypesIfNeeded(first, second, settings, json_info); + break; + case FormatSettings::EscapingRule::Escaped: [[fallthrough]]; + case FormatSettings::EscapingRule::Raw: [[fallthrough]]; + case FormatSettings::EscapingRule::Quoted: [[fallthrough]]; + case FormatSettings::EscapingRule::CSV: + transformInferredTypesIfNeeded(first, second, settings); + break; + default: + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Cannot transform inferred types for value with {} escaping rule", + escapingRuleToString(escaping_rule)); + } +} + + +DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule) +{ + switch (escaping_rule) + { + case FormatSettings::EscapingRule::CSV: + case FormatSettings::EscapingRule::Escaped: + case FormatSettings::EscapingRule::Raw: + return std::make_shared<DataTypeString>(); + default: + return nullptr; + } +} + 
+/// Returns the default data type for every rule in escaping_rules, in the same order (an entry is nullptr when a rule has no default type).
+DataTypes getDefaultDataTypeForEscapingRules(const std::vector<FormatSettings::EscapingRule> & escaping_rules) +{ + DataTypes data_types; + data_types.reserve(escaping_rules.size()); + for (const auto & rule : escaping_rules) + data_types.push_back(getDefaultDataTypeForEscapingRule(rule)); + return data_types; +} + +String getAdditionalFormatInfoForAllRowBasedFormats(const FormatSettings & settings) +{ + return fmt::format( + "schema_inference_hints={}, max_rows_to_read_for_schema_inference={}, max_bytes_to_read_for_schema_inference={}, schema_inference_make_columns_nullable={}", + settings.schema_inference_hints, + settings.max_rows_to_read_for_schema_inference, + settings.max_bytes_to_read_for_schema_inference, + settings.schema_inference_make_columns_nullable); +} + +String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule) +{ + String result = getAdditionalFormatInfoForAllRowBasedFormats(settings); + /// First, settings that are common for all text formats: + result += fmt::format( + ", try_infer_integers={}, try_infer_dates={}, try_infer_datetimes={}", + settings.try_infer_integers, + settings.try_infer_dates, + settings.try_infer_datetimes); + + /// Second, format-specific settings: + switch (escaping_rule) + { + case FormatSettings::EscapingRule::Escaped: + case FormatSettings::EscapingRule::Raw: + result += fmt::format( + ", use_best_effort_in_schema_inference={}, bool_true_representation={}, bool_false_representation={}, null_representation={}", + settings.tsv.use_best_effort_in_schema_inference, + settings.bool_true_representation, + settings.bool_false_representation, + settings.tsv.null_representation); + break; + case FormatSettings::EscapingRule::CSV: + result += fmt::format( + ", use_best_effort_in_schema_inference={}, bool_true_representation={}, bool_false_representation={}," + " null_representation={}, delimiter={}, tuple_delimiter={}", + settings.csv.use_best_effort_in_schema_inference, + settings.bool_true_representation, + 
settings.bool_false_representation, + settings.csv.null_representation, + settings.csv.delimiter, + settings.csv.tuple_delimiter); + break; + case FormatSettings::EscapingRule::JSON: + result += fmt::format( + ", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_objects_as_strings={}, read_numbers_as_strings={}, try_infer_objects={}", + settings.json.try_infer_numbers_from_strings, + settings.json.read_bools_as_numbers, + settings.json.read_objects_as_strings, + settings.json.read_numbers_as_strings, + settings.json.allow_object_type); + break; + default: + break; + } + + return result; +} + + +void checkSupportedDelimiterAfterField(FormatSettings::EscapingRule escaping_rule, const String & delimiter, const DataTypePtr & type) +{ + if (escaping_rule != FormatSettings::EscapingRule::Escaped) + return; + + bool is_supported_delimiter_after_string = !delimiter.empty() && (delimiter.front() == '\t' || delimiter.front() == '\n'); + if (is_supported_delimiter_after_string) + return; + + /// Nullptr means that field is skipped and it's equivalent to String + if (!type || isString(removeNullable(removeLowCardinality(type)))) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "'Escaped' serialization requires delimiter after String field to start with '\\t' or '\\n'"); +} + +} diff --git a/contrib/clickhouse/src/Formats/EscapingRuleUtils.h b/contrib/clickhouse/src/Formats/EscapingRuleUtils.h new file mode 100644 index 0000000000..274b21107f --- /dev/null +++ b/contrib/clickhouse/src/Formats/EscapingRuleUtils.h @@ -0,0 +1,64 @@ +#pragma once + +#include <Formats/FormatSettings.h> +#include <Formats/SchemaInferenceUtils.h> +#include <DataTypes/IDataType.h> +#include <DataTypes/Serializations/ISerialization.h> +#include <IO/ReadBuffer.h> +#include <Interpreters/Context.h> + +namespace DB +{ + +FormatSettings::EscapingRule stringToEscapingRule(const String & escaping_rule); + +String escapingRuleToString(FormatSettings::EscapingRule escaping_rule); + +void 
skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings); + +bool deserializeFieldByEscapingRule( + const DataTypePtr & type, + const SerializationPtr & serialization, + IColumn & column, + ReadBuffer & buf, + FormatSettings::EscapingRule escaping_rule, + const FormatSettings & format_settings); + +void serializeFieldByEscapingRule( + const IColumn & column, + const ISerialization & serialization, + WriteBuffer & out, + size_t row_num, + FormatSettings::EscapingRule escaping_rule, + const FormatSettings & format_settings); + +void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings); + +/// Read String serialized in specified escaping rule. +String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings); +/// Read any field serialized in specified escaping rule. It can be any field like number/array/etc. +/// This function should return value exactly as it was in the data without changes +/// (for example without parsing escaped sequences) +String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings); +/// In case we don't know if we have String value or not, but need to read String values as String (with correct escaped sequences parsing). +String readStringOrFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings); + +/// Try to determine the type of the field written by a specific escaping rule. +/// If it cannot, return nullptr. 
+/// See tryInferDataTypeForSingle(JSON)Field in SchemaInferenceUtils.h +DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info = nullptr); +DataTypes tryInferDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info = nullptr); + +/// Check if we need to transform types inferred from data and transform it if necessary. +/// See transformInferred(JSON)TypesIfNeeded in SchemaInferenceUtils.h +void transformInferredTypesByEscapingRuleIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info = nullptr); + +DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule); +DataTypes getDefaultDataTypeForEscapingRules(const std::vector<FormatSettings::EscapingRule> & escaping_rules); + +String getAdditionalFormatInfoForAllRowBasedFormats(const FormatSettings & settings); +String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule); + +void checkSupportedDelimiterAfterField(FormatSettings::EscapingRule escaping_rule, const String & delimiter, const DataTypePtr & type); + +} diff --git a/contrib/clickhouse/src/Formats/FormatFactory.cpp b/contrib/clickhouse/src/Formats/FormatFactory.cpp new file mode 100644 index 0000000000..c349ebd94e --- /dev/null +++ b/contrib/clickhouse/src/Formats/FormatFactory.cpp @@ -0,0 +1,790 @@ +#include <Formats/FormatFactory.h> + +#include <algorithm> +#include <Core/Settings.h> +#include <Formats/FormatSettings.h> +#include <Interpreters/Context.h> +#include <Interpreters/ProcessList.h> +#include <IO/SharedThreadPools.h> +#include <Processors/Formats/IRowInputFormat.h> +#include <Processors/Formats/IRowOutputFormat.h> 
+#include <Processors/Formats/Impl/MySQLOutputFormat.h> +#include <Processors/Formats/Impl/ParallelFormattingOutputFormat.h> +#include <Processors/Formats/Impl/ParallelParsingInputFormat.h> +#include <Processors/Formats/Impl/ValuesBlockInputFormat.h> +#include <Poco/URI.h> +#include <Common/Exception.h> +#include <Common/KnownObjectNames.h> +#include <unistd.h> + +#include <boost/algorithm/string/case_conv.hpp> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNKNOWN_FORMAT; + extern const int LOGICAL_ERROR; + extern const int FORMAT_IS_NOT_SUITABLE_FOR_INPUT; + extern const int FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT; + extern const int BAD_ARGUMENTS; +} + +const FormatFactory::Creators & FormatFactory::getCreators(const String & name) const +{ + auto it = dict.find(name); + if (dict.end() != it) + return it->second; + throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", name); +} + +FormatSettings getFormatSettings(ContextPtr context) +{ + const auto & settings = context->getSettingsRef(); + + return getFormatSettings(context, settings); +} + +template <typename Settings> +FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) +{ + FormatSettings format_settings; + + format_settings.avro.allow_missing_fields = settings.input_format_avro_allow_missing_fields; + format_settings.avro.output_codec = settings.output_format_avro_codec; + format_settings.avro.output_sync_interval = settings.output_format_avro_sync_interval; + format_settings.avro.schema_registry_url = settings.format_avro_schema_registry_url.toString(); + format_settings.avro.string_column_pattern = settings.output_format_avro_string_column_pattern.toString(); + format_settings.avro.output_rows_in_file = settings.output_format_avro_rows_in_file; + format_settings.csv.allow_double_quotes = settings.format_csv_allow_double_quotes; + format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes; + format_settings.csv.crlf_end_of_line = 
settings.output_format_csv_crlf_end_of_line; + format_settings.csv.delimiter = settings.format_csv_delimiter; + format_settings.csv.tuple_delimiter = settings.format_csv_delimiter; + format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default; + format_settings.csv.enum_as_number = settings.input_format_csv_enum_as_number; + format_settings.csv.null_representation = settings.format_csv_null_representation; + format_settings.csv.arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv; + format_settings.csv.use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference; + format_settings.csv.skip_first_lines = settings.input_format_csv_skip_first_lines; + format_settings.csv.try_detect_header = settings.input_format_csv_detect_header; + format_settings.csv.skip_trailing_empty_lines = settings.input_format_csv_skip_trailing_empty_lines; + format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces; + format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter; + format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns; + format_settings.csv.use_default_on_bad_values = settings.input_format_csv_use_default_on_bad_values; + format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter; + format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter; + format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter; + format_settings.custom.escaping_rule = settings.format_custom_escaping_rule; + format_settings.custom.field_delimiter = settings.format_custom_field_delimiter; + format_settings.custom.result_after_delimiter = settings.format_custom_result_after_delimiter; + format_settings.custom.result_before_delimiter = 
settings.format_custom_result_before_delimiter; + format_settings.custom.row_after_delimiter = settings.format_custom_row_after_delimiter; + format_settings.custom.row_before_delimiter = settings.format_custom_row_before_delimiter; + format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter; + format_settings.custom.try_detect_header = settings.input_format_custom_detect_header; + format_settings.custom.skip_trailing_empty_lines = settings.input_format_custom_skip_trailing_empty_lines; + format_settings.custom.allow_variable_number_of_columns = settings.input_format_custom_allow_variable_number_of_columns; + format_settings.date_time_input_format = settings.date_time_input_format; + format_settings.date_time_output_format = settings.date_time_output_format; + format_settings.interval.output_format = settings.interval_output_format; + format_settings.input_format_ipv4_default_on_conversion_error = settings.input_format_ipv4_default_on_conversion_error; + format_settings.input_format_ipv6_default_on_conversion_error = settings.input_format_ipv6_default_on_conversion_error; + format_settings.bool_true_representation = settings.bool_true_representation; + format_settings.bool_false_representation = settings.bool_false_representation; + format_settings.enable_streaming = settings.output_format_enable_streaming; + format_settings.import_nested_json = settings.input_format_import_nested_json; + format_settings.input_allow_errors_num = settings.input_format_allow_errors_num; + format_settings.input_allow_errors_ratio = settings.input_format_allow_errors_ratio; + format_settings.json.array_of_rows = settings.output_format_json_array_of_rows; + format_settings.json.escape_forward_slashes = settings.output_format_json_escape_forward_slashes; + format_settings.json.write_named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects; + format_settings.json.read_named_tuples_as_objects = 
settings.input_format_json_named_tuples_as_objects; + format_settings.json.defaults_for_missing_elements_in_named_tuple = settings.input_format_json_defaults_for_missing_elements_in_named_tuple; + format_settings.json.ignore_unknown_keys_in_named_tuple = settings.input_format_json_ignore_unknown_keys_in_named_tuple; + format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers; + format_settings.json.quote_64bit_floats = settings.output_format_json_quote_64bit_floats; + format_settings.json.quote_denormals = settings.output_format_json_quote_denormals; + format_settings.json.quote_decimals = settings.output_format_json_quote_decimals; + format_settings.json.read_bools_as_numbers = settings.input_format_json_read_bools_as_numbers; + format_settings.json.read_numbers_as_strings = settings.input_format_json_read_numbers_as_strings; + format_settings.json.read_objects_as_strings = settings.input_format_json_read_objects_as_strings; + format_settings.json.try_infer_numbers_from_strings = settings.input_format_json_try_infer_numbers_from_strings; + format_settings.json.validate_types_from_metadata = settings.input_format_json_validate_types_from_metadata; + format_settings.json.validate_utf8 = settings.output_format_json_validate_utf8; + format_settings.json_object_each_row.column_for_object_name = settings.format_json_object_each_row_column_for_object_name; + format_settings.json.allow_object_type = context->getSettingsRef().allow_experimental_object_type; + format_settings.json.compact_allow_variable_number_of_columns = settings.input_format_json_compact_allow_variable_number_of_columns; + format_settings.null_as_default = settings.input_format_null_as_default; + format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros; + format_settings.parquet.row_group_rows = settings.output_format_parquet_row_group_size; + format_settings.parquet.row_group_bytes = settings.output_format_parquet_row_group_size_bytes; + 
format_settings.parquet.output_version = settings.output_format_parquet_version; + format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching; + format_settings.parquet.preserve_order = settings.input_format_parquet_preserve_order; + format_settings.parquet.filter_push_down = settings.input_format_parquet_filter_push_down; + format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns; + format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference; + format_settings.parquet.output_string_as_string = settings.output_format_parquet_string_as_string; + format_settings.parquet.output_fixed_string_as_fixed_byte_array = settings.output_format_parquet_fixed_string_as_fixed_byte_array; + format_settings.parquet.max_block_size = settings.input_format_parquet_max_block_size; + format_settings.parquet.output_compression_method = settings.output_format_parquet_compression_method; + format_settings.parquet.output_compliant_nested_types = settings.output_format_parquet_compliant_nested_types; + format_settings.parquet.use_custom_encoder = settings.output_format_parquet_use_custom_encoder; + format_settings.parquet.parallel_encoding = settings.output_format_parquet_parallel_encoding; + format_settings.parquet.data_page_size = settings.output_format_parquet_data_page_size; + format_settings.parquet.write_batch_size = settings.output_format_parquet_batch_size; + format_settings.parquet.local_read_min_bytes_for_seek = settings.input_format_parquet_local_file_min_bytes_for_seek; + format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? 
FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8; + format_settings.pretty.color = settings.output_format_pretty_color; + format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width; + format_settings.pretty.max_rows = settings.output_format_pretty_max_rows; + format_settings.pretty.max_value_width = settings.output_format_pretty_max_value_width; + format_settings.pretty.output_format_pretty_row_numbers = settings.output_format_pretty_row_numbers; + format_settings.protobuf.input_flatten_google_wrappers = settings.input_format_protobuf_flatten_google_wrappers; + format_settings.protobuf.output_nullables_with_google_wrappers = settings.output_format_protobuf_nullables_with_google_wrappers; + format_settings.protobuf.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference; + format_settings.protobuf.use_autogenerated_schema = settings.format_protobuf_use_autogenerated_schema; + format_settings.regexp.escaping_rule = settings.format_regexp_escaping_rule; + format_settings.regexp.regexp = settings.format_regexp; + format_settings.regexp.skip_unmatched = settings.format_regexp_skip_unmatched; + format_settings.schema.format_schema = settings.format_schema; + format_settings.schema.format_schema_path = context->getFormatSchemaPath(); + format_settings.schema.is_server = context->hasGlobalContext() && (context->getGlobalContext()->getApplicationType() == Context::ApplicationType::SERVER); + format_settings.schema.output_format_schema = settings.output_format_schema; + format_settings.skip_unknown_fields = settings.input_format_skip_unknown_fields; + format_settings.template_settings.resultset_format = settings.format_template_resultset; + format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter; + format_settings.template_settings.row_format = settings.format_template_row; 
+ format_settings.tsv.crlf_end_of_line = settings.output_format_tsv_crlf_end_of_line; + format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default; + format_settings.tsv.enum_as_number = settings.input_format_tsv_enum_as_number; + format_settings.tsv.null_representation = settings.format_tsv_null_representation; + format_settings.tsv.use_best_effort_in_schema_inference = settings.input_format_tsv_use_best_effort_in_schema_inference; + format_settings.tsv.skip_first_lines = settings.input_format_tsv_skip_first_lines; + format_settings.tsv.try_detect_header = settings.input_format_tsv_detect_header; + format_settings.tsv.skip_trailing_empty_lines = settings.input_format_tsv_skip_trailing_empty_lines; + format_settings.tsv.allow_variable_number_of_columns = settings.input_format_tsv_allow_variable_number_of_columns; + format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals; + format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions; + format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions; + format_settings.with_names_use_header = settings.input_format_with_names_use_header; + format_settings.with_types_use_header = settings.input_format_with_types_use_header; + format_settings.write_statistics = settings.output_format_write_statistics; + format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary; + format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns; + format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference; + 
format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching; + format_settings.arrow.output_string_as_string = settings.output_format_arrow_string_as_string; + format_settings.arrow.output_fixed_string_as_fixed_byte_array = settings.output_format_arrow_fixed_string_as_fixed_byte_array; + format_settings.arrow.output_compression_method = settings.output_format_arrow_compression_method; + format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns; + format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; + format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference; + format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching; + format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string; + format_settings.orc.output_compression_method = settings.output_format_orc_compression_method; + format_settings.orc.use_fast_decoder = settings.input_format_orc_use_fast_decoder; + format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; + format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; + format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference; + format_settings.capn_proto.use_autogenerated_schema = settings.format_capn_proto_use_autogenerated_schema; 
+ format_settings.seekable_read = settings.input_format_allow_seeks; + format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns; + format_settings.msgpack.output_uuid_representation = settings.output_format_msgpack_uuid_representation; + /// The row limit and the byte limit for schema inference are two separate settings; each one is read from its own field.
+ format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference; + format_settings.max_bytes_to_read_for_schema_inference = settings.input_format_max_bytes_to_read_for_schema_inference; + format_settings.column_names_for_schema_inference = settings.column_names_for_schema_inference; + format_settings.schema_inference_hints = settings.schema_inference_hints; + format_settings.schema_inference_make_columns_nullable = settings.schema_inference_make_columns_nullable; + format_settings.mysql_dump.table_name = settings.input_format_mysql_dump_table_name; + format_settings.mysql_dump.map_column_names = settings.input_format_mysql_dump_map_column_names; + format_settings.sql_insert.max_batch_size = settings.output_format_sql_insert_max_batch_size; + format_settings.sql_insert.include_column_names = settings.output_format_sql_insert_include_column_names; + format_settings.sql_insert.table_name = settings.output_format_sql_insert_table_name; + format_settings.sql_insert.use_replace = settings.output_format_sql_insert_use_replace; + format_settings.sql_insert.quote_names = settings.output_format_sql_insert_quote_names; + format_settings.try_infer_integers = settings.input_format_try_infer_integers; + format_settings.try_infer_dates = settings.input_format_try_infer_dates; + format_settings.try_infer_datetimes = settings.input_format_try_infer_datetimes; + format_settings.bson.output_string_as_string = settings.output_format_bson_string_as_string; + format_settings.bson.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_bson_skip_fields_with_unsupported_types_in_schema_inference; + format_settings.max_binary_string_size = 
settings.format_binary_max_string_size; + format_settings.max_binary_array_size = settings.format_binary_max_array_size; + format_settings.native.allow_types_conversion = settings.input_format_native_allow_types_conversion; + format_settings.max_parser_depth = context->getSettingsRef().max_parser_depth; + format_settings.client_protocol_version = context->getClientProtocolVersion(); + + /// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context + if (format_settings.schema.is_server) + { + const Poco::URI & avro_schema_registry_url = settings.format_avro_schema_registry_url; + if (!avro_schema_registry_url.empty()) + context->getRemoteHostFilter().checkURL(avro_schema_registry_url); + } + + return format_settings; +} + +template FormatSettings getFormatSettings<FormatFactorySettings>(ContextPtr context, const FormatFactorySettings & settings); + +template FormatSettings getFormatSettings<Settings>(ContextPtr context, const Settings & settings); + + +InputFormatPtr FormatFactory::getInput( + const String & name, + ReadBuffer & _buf, + const Block & sample, + ContextPtr context, + UInt64 max_block_size, + const std::optional<FormatSettings> & _format_settings, + std::optional<size_t> _max_parsing_threads, + std::optional<size_t> _max_download_threads, + bool is_remote_fs, + CompressionMethod compression) const +{ + const auto& creators = getCreators(name); + if (!creators.input_creator && !creators.random_access_input_creator) + throw Exception(ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT, "Format {} is not suitable for input", name); + + auto format_settings = _format_settings ? 
*_format_settings : getFormatSettings(context); + const Settings & settings = context->getSettingsRef(); + size_t max_parsing_threads = _max_parsing_threads.value_or(settings.max_threads); + size_t max_download_threads = _max_download_threads.value_or(settings.max_download_threads); + + RowInputFormatParams row_input_format_params; + row_input_format_params.max_block_size = max_block_size; + row_input_format_params.allow_errors_num = format_settings.input_allow_errors_num; + row_input_format_params.allow_errors_ratio = format_settings.input_allow_errors_ratio; + row_input_format_params.max_execution_time = settings.max_execution_time; + row_input_format_params.timeout_overflow_mode = settings.timeout_overflow_mode; + + if (context->hasQueryContext() && settings.log_queries) + context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name); + + // Add ParallelReadBuffer and decompression if needed. + + auto owned_buf = wrapReadBufferIfNeeded(_buf, compression, creators, format_settings, settings, is_remote_fs, max_download_threads); + auto & buf = owned_buf ? *owned_buf : _buf; + + // Decide whether to use ParallelParsingInputFormat. + + bool parallel_parsing = max_parsing_threads > 1 && settings.input_format_parallel_parsing && creators.file_segmentation_engine && !creators.random_access_input_creator; + + if (settings.max_memory_usage && settings.min_chunk_bytes_for_parallel_parsing * max_parsing_threads * 2 > settings.max_memory_usage) + parallel_parsing = false; + if (settings.max_memory_usage_for_user && settings.min_chunk_bytes_for_parallel_parsing * max_parsing_threads * 2 > settings.max_memory_usage_for_user) + parallel_parsing = false; + + if (parallel_parsing) + { + const auto & non_trivial_prefix_and_suffix_checker = creators.non_trivial_prefix_and_suffix_checker; + /// Disable parallel parsing for input formats with non-trivial readPrefix() and readSuffix(). 
+ if (non_trivial_prefix_and_suffix_checker && non_trivial_prefix_and_suffix_checker(buf)) + parallel_parsing = false; + } + + // Create the InputFormat in one of 3 ways. + + InputFormatPtr format; + + if (parallel_parsing) + { + const auto & input_getter = creators.input_creator; + + /// Const reference is copied to lambda. + auto parser_creator = [input_getter, sample, row_input_format_params, format_settings] + (ReadBuffer & input) -> InputFormatPtr + { return input_getter(input, sample, row_input_format_params, format_settings); }; + + ParallelParsingInputFormat::Params params{ + buf, sample, parser_creator, creators.file_segmentation_engine, name, max_parsing_threads, + settings.min_chunk_bytes_for_parallel_parsing, max_block_size, context->getApplicationType() == Context::ApplicationType::SERVER}; + + format = std::make_shared<ParallelParsingInputFormat>(params); + } + else if (creators.random_access_input_creator) + { + format = creators.random_access_input_creator( + buf, + sample, + format_settings, + context->getReadSettings(), + is_remote_fs, + max_download_threads, + max_parsing_threads); + } + else + { + format = creators.input_creator(buf, sample, row_input_format_params, format_settings); + } + + if (owned_buf) + format->addBuffer(std::move(owned_buf)); + if (!settings.input_format_record_errors_file_path.toString().empty()) + format->setErrorsLogger(std::make_shared<ParallelInputFormatErrorsLogger>(context)); + + /// It's a kludge. Because I cannot remove context from values format. + /// (Not needed in the parallel_parsing case above because VALUES format doesn't support it.) 
+ if (auto * values = typeid_cast<ValuesBlockInputFormat *>(format.get())) + values->setContext(context); + + return format; +} + +std::unique_ptr<ReadBuffer> FormatFactory::wrapReadBufferIfNeeded( + ReadBuffer & buf, + CompressionMethod compression, + const Creators & creators, + const FormatSettings & format_settings, + const Settings & settings, + bool is_remote_fs, + size_t max_download_threads) const +{ + std::unique_ptr<ReadBuffer> res; + + bool parallel_read = is_remote_fs && max_download_threads > 1 && format_settings.seekable_read && isBufferWithFileSize(buf); + if (creators.random_access_input_creator) + parallel_read &= compression != CompressionMethod::None; + size_t file_size = 0; + + if (parallel_read) + { + try + { + file_size = getFileSizeFromReadBuffer(buf); + parallel_read = file_size >= 2 * settings.max_download_buffer_size; + } + catch (const Poco::Exception & e) + { + parallel_read = false; + LOG_TRACE( + &Poco::Logger::get("FormatFactory"), + "Failed to setup ParallelReadBuffer because of an exception:\n{}.\n" + "Falling back to the single-threaded buffer", + e.displayText()); + } + } + + if (parallel_read) + { + LOG_TRACE( + &Poco::Logger::get("FormatFactory"), + "Using ParallelReadBuffer with {} workers with chunks of {} bytes", + max_download_threads, + settings.max_download_buffer_size); + + res = wrapInParallelReadBufferIfSupported( + buf, threadPoolCallbackRunner<void>(getIOThreadPool().get(), "ParallelRead"), + max_download_threads, settings.max_download_buffer_size, file_size); + } + + if (compression != CompressionMethod::None) + { + if (!res) + res = wrapReadBufferReference(buf); + res = wrapReadBufferWithCompressionMethod(std::move(res), compression, static_cast<int>(settings.zstd_window_log_max)); + } + + return res; +} + +static void addExistingProgressToOutputFormat(OutputFormatPtr format, ContextPtr context) +{ + auto element_id = context->getProcessListElement(); + if (element_id) + { + /// While preparing the query there might 
have been progress (for example in subscalar subqueries) so add it here + auto current_progress = element_id->getProgressIn(); + Progress read_progress{current_progress.read_rows, current_progress.read_bytes, current_progress.total_rows_to_read}; + format->onProgress(read_progress); + + /// Update the start of the statistics to use the start of the query, and not the creation of the format class + format->setStartTime(element_id->getQueryCPUStartTime(), true); + } +} + +OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible( + const String & name, + WriteBuffer & buf, + const Block & sample, + ContextPtr context, + const std::optional<FormatSettings> & _format_settings) const +{ + const auto & output_getter = getCreators(name).output_creator; + if (!output_getter) + throw Exception(ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT, "Format {} is not suitable for output", name); + + auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context); + + const Settings & settings = context->getSettingsRef(); + + if (settings.output_format_parallel_formatting && getCreators(name).supports_parallel_formatting + && !settings.output_format_json_array_of_rows) + { + auto formatter_creator = [output_getter, sample, format_settings] (WriteBuffer & output) -> OutputFormatPtr + { + return output_getter(output, sample, format_settings); + }; + + ParallelFormattingOutputFormat::Params builder{buf, sample, formatter_creator, settings.max_threads}; + + if (context->hasQueryContext() && settings.log_queries) + context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name); + + auto format = std::make_shared<ParallelFormattingOutputFormat>(builder); + addExistingProgressToOutputFormat(format, context); + return format; + } + + return getOutputFormat(name, buf, sample, context, format_settings); +} + + +OutputFormatPtr FormatFactory::getOutputFormat( + const String & name, + WriteBuffer & buf, + const Block & sample, + 
ContextPtr context, + const std::optional<FormatSettings> & _format_settings) const +{ + const auto & output_getter = getCreators(name).output_creator; + if (!output_getter) + throw Exception(ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT, "Format {} is not suitable for output", name); + + if (context->hasQueryContext() && context->getSettingsRef().log_queries) + context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name); + + auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context); + format_settings.max_threads = context->getSettingsRef().max_threads; + + /** TODO: Materialization is needed, because formats can use the functions `IDataType`, + * which only work with full columns. + */ + auto format = output_getter(buf, sample, format_settings); + + /// Enable auto-flush for streaming mode. Currently it is needed by INSERT WATCH query. + if (format_settings.enable_streaming) + format->setAutoFlush(); + + /// It's a kludge. Because I cannot remove context from MySQL format. + if (auto * mysql = typeid_cast<MySQLOutputFormat *>(format.get())) + mysql->setContext(context); + + addExistingProgressToOutputFormat(format, context); + + return format; +} + +String FormatFactory::getContentType( + const String & name, + ContextPtr context, + const std::optional<FormatSettings> & _format_settings) const +{ + const auto & output_getter = getCreators(name).output_creator; + if (!output_getter) + throw Exception(ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT, "Format {} is not suitable for output", name); + + auto format_settings = _format_settings ? 
*_format_settings : getFormatSettings(context); + + Block empty_block; + WriteBufferFromOwnString empty_buffer; + auto format = output_getter(empty_buffer, empty_block, format_settings); + + return format->getContentType(); +} + +SchemaReaderPtr FormatFactory::getSchemaReader( + const String & name, + ReadBuffer & buf, + ContextPtr & context, + const std::optional<FormatSettings> & _format_settings) const +{ + const auto & schema_reader_creator = dict.at(name).schema_reader_creator; + if (!schema_reader_creator) + throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} doesn't support schema inference.", name); + + auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context); + auto schema_reader = schema_reader_creator(buf, format_settings); + if (schema_reader->needContext()) + schema_reader->setContext(context); + return schema_reader; +} + +ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader( + const String & name, + ContextPtr & context, + const std::optional<FormatSettings> & _format_settings) const +{ + const auto & external_schema_reader_creator = dict.at(name).external_schema_reader_creator; + if (!external_schema_reader_creator) + throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} doesn't support schema inference.", name); + + auto format_settings = _format_settings ? 
*_format_settings : getFormatSettings(context); + return external_schema_reader_creator(format_settings); +} + +void FormatFactory::registerInputFormat(const String & name, InputCreator input_creator) +{ + chassert(input_creator); + auto & creators = dict[name]; + if (creators.input_creator || creators.random_access_input_creator) + throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Input format {} is already registered", name); + creators.input_creator = std::move(input_creator); + registerFileExtension(name, name); + KnownFormatNames::instance().add(name); +} + +void FormatFactory::registerRandomAccessInputFormat(const String & name, RandomAccessInputCreator input_creator) +{ + chassert(input_creator); + auto & creators = dict[name]; + if (creators.input_creator || creators.random_access_input_creator) + throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Input format {} is already registered", name); + creators.random_access_input_creator = std::move(input_creator); + registerFileExtension(name, name); + KnownFormatNames::instance().add(name); +} + +void FormatFactory::registerNonTrivialPrefixAndSuffixChecker(const String & name, NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker) +{ + auto & target = dict[name].non_trivial_prefix_and_suffix_checker; + if (target) + throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Non trivial prefix and suffix checker {} is already registered", name); + target = std::move(non_trivial_prefix_and_suffix_checker); +} + +void FormatFactory::registerAppendSupportChecker(const String & name, AppendSupportChecker append_support_checker) +{ + auto & target = dict[name].append_support_checker; + if (target) + throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Suffix checker {} is already registered", name); + target = std::move(append_support_checker); +} + +void FormatFactory::markFormatHasNoAppendSupport(const String & name) +{ + registerAppendSupportChecker(name, [](const 
FormatSettings &){ return false; }); +} + +bool FormatFactory::checkIfFormatSupportAppend(const String & name, ContextPtr context, const std::optional<FormatSettings> & format_settings_) +{ + auto format_settings = format_settings_ ? *format_settings_ : getFormatSettings(context); + auto & append_support_checker = dict[name].append_support_checker; + /// By default we consider that format supports append + return !append_support_checker || append_support_checker(format_settings); +} + +void FormatFactory::registerOutputFormat(const String & name, OutputCreator output_creator) +{ + auto & target = dict[name].output_creator; + if (target) + throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Output format {} is already registered", name); + target = std::move(output_creator); + registerFileExtension(name, name); + KnownFormatNames::instance().add(name); +} + +void FormatFactory::registerFileExtension(const String & extension, const String & format_name) +{ + file_extension_formats[boost::to_lower_copy(extension)] = format_name; +} + +String FormatFactory::getFormatFromFileName(String file_name, bool throw_if_not_found) +{ + if (file_name == "stdin") + return getFormatFromFileDescriptor(STDIN_FILENO); + + CompressionMethod compression_method = chooseCompressionMethod(file_name, ""); + if (CompressionMethod::None != compression_method) + { + auto pos = file_name.find_last_of('.'); + if (pos != String::npos) + file_name = file_name.substr(0, pos); + } + + auto pos = file_name.find_last_of('.'); + if (pos == String::npos) + { + if (throw_if_not_found) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the file format by it's extension"); + return ""; + } + + String file_extension = file_name.substr(pos + 1, String::npos); + boost::algorithm::to_lower(file_extension); + auto it = file_extension_formats.find(file_extension); + if (it == file_extension_formats.end()) + { + if (throw_if_not_found) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot 
determine the file format by it's extension"); + return ""; + } + return it->second; +} + +String FormatFactory::getFormatFromFileDescriptor(int fd) +{ +#ifdef OS_LINUX + std::string proc_path = fmt::format("/proc/self/fd/{}", fd); + char file_path[PATH_MAX] = {'\0'}; + if (readlink(proc_path.c_str(), file_path, sizeof(file_path) - 1) != -1) + return getFormatFromFileName(file_path, false); + return ""; +#elif defined(OS_DARWIN) + char file_path[PATH_MAX] = {'\0'}; + if (fcntl(fd, F_GETPATH, file_path) != -1) + return getFormatFromFileName(file_path, false); + return ""; +#else + (void)fd; + return ""; +#endif +} + +void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine) +{ + auto & target = dict[name].file_segmentation_engine; + if (target) + throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: File segmentation engine {} is already registered", name); + target = std::move(file_segmentation_engine); +} + +void FormatFactory::registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator) +{ + auto & target = dict[name].schema_reader_creator; + if (target) + throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Schema reader {} is already registered", name); + target = std::move(schema_reader_creator); +} + +void FormatFactory::registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator) +{ + auto & target = dict[name].external_schema_reader_creator; + if (target) + throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Schema reader {} is already registered", name); + target = std::move(external_schema_reader_creator); +} + +void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & name) +{ + auto & target = dict[name].supports_parallel_formatting; + if (target) + throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Output format {} is already marked as supporting parallel formatting", 
name); + target = true; +} + + +void FormatFactory::markFormatSupportsSubsetOfColumns(const String & name) +{ + auto & target = dict[name].subset_of_columns_support_checker; + if (target) + throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} is already marked as supporting subset of columns", name); + target = [](const FormatSettings &){ return true; }; +} + +void FormatFactory::registerSubsetOfColumnsSupportChecker(const String & name, SubsetOfColumnsSupportChecker subset_of_columns_support_checker) +{ + auto & target = dict[name].subset_of_columns_support_checker; + if (target) + throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} is already marked as supporting subset of columns", name); + target = std::move(subset_of_columns_support_checker); +} + +void FormatFactory::markOutputFormatPrefersLargeBlocks(const String & name) +{ + auto & target = dict[name].prefers_large_blocks; + if (target) + throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} is already marked as preferring large blocks", name); + target = true; +} + +bool FormatFactory::checkIfFormatSupportsSubsetOfColumns(const DB::String & name, const ContextPtr & context, const std::optional<FormatSettings> & format_settings_) const +{ + const auto & target = getCreators(name); + auto format_settings = format_settings_ ? 
*format_settings_ : getFormatSettings(context); + return target.subset_of_columns_support_checker && target.subset_of_columns_support_checker(format_settings); +} + +void FormatFactory::registerAdditionalInfoForSchemaCacheGetter( + const String & name, AdditionalInfoForSchemaCacheGetter additional_info_for_schema_cache_getter) +{ + auto & target = dict[name].additional_info_for_schema_cache_getter; + if (target) + throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: additional info for schema cache getter {} is already registered", name); + target = std::move(additional_info_for_schema_cache_getter); +} + +String FormatFactory::getAdditionalInfoForSchemaCache(const String & name, ContextPtr context, const std::optional<FormatSettings> & format_settings_) +{ + const auto & additional_info_getter = getCreators(name).additional_info_for_schema_cache_getter; + if (!additional_info_getter) + return ""; + + auto format_settings = format_settings_ ? *format_settings_ : getFormatSettings(context); + return additional_info_getter(format_settings); +} + +bool FormatFactory::isInputFormat(const String & name) const +{ + auto it = dict.find(name); + return it != dict.end() && (it->second.input_creator || it->second.random_access_input_creator); +} + +bool FormatFactory::isOutputFormat(const String & name) const +{ + auto it = dict.find(name); + return it != dict.end() && it->second.output_creator; +} + +bool FormatFactory::checkIfFormatHasSchemaReader(const String & name) const +{ + const auto & target = getCreators(name); + return bool(target.schema_reader_creator); +} + +bool FormatFactory::checkIfFormatHasExternalSchemaReader(const String & name) const +{ + const auto & target = getCreators(name); + return bool(target.external_schema_reader_creator); +} + +bool FormatFactory::checkIfFormatHasAnySchemaReader(const String & name) const +{ + return checkIfFormatHasSchemaReader(name) || checkIfFormatHasExternalSchemaReader(name); +} + +bool 
FormatFactory::checkIfOutputFormatPrefersLargeBlocks(const String & name) const +{ + const auto & target = getCreators(name); + return target.prefers_large_blocks; +} + +bool FormatFactory::checkParallelizeOutputAfterReading(const String & name, ContextPtr context) const +{ + if (name == "Parquet" && context->getSettingsRef().input_format_parquet_preserve_order) + return false; + + return true; +} + +void FormatFactory::checkFormatName(const String & name) const +{ + auto it = dict.find(name); + if (it == dict.end()) + throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", name); +} + +FormatFactory & FormatFactory::instance() +{ + static FormatFactory ret; + return ret; +} + +} diff --git a/contrib/clickhouse/src/Formats/FormatFactory.h b/contrib/clickhouse/src/Formats/FormatFactory.h new file mode 100644 index 0000000000..48a1869d56 --- /dev/null +++ b/contrib/clickhouse/src/Formats/FormatFactory.h @@ -0,0 +1,274 @@ +#pragma once + +#include <Common/Allocator.h> +#include <Columns/IColumn.h> +#include <Formats/FormatSettings.h> +#include <Interpreters/Context_fwd.h> +#include <IO/BufferWithOwnMemory.h> +#include <IO/CompressionMethod.h> +#include <IO/ParallelReadBuffer.h> +#include <base/types.h> +#include <Core/NamesAndTypes.h> + +#include <boost/noncopyable.hpp> + +#include <functional> +#include <memory> +#include <unordered_map> + + +namespace DB +{ + +class Block; +struct Settings; +struct FormatFactorySettings; + +class ReadBuffer; +class WriteBuffer; + +class IProcessor; +using ProcessorPtr = std::shared_ptr<IProcessor>; + +class IInputFormat; +class IOutputFormat; +class IRowOutputFormat; + +struct RowInputFormatParams; + +class ISchemaReader; +class IExternalSchemaReader; +using SchemaReaderPtr = std::shared_ptr<ISchemaReader>; +using ExternalSchemaReaderPtr = std::shared_ptr<IExternalSchemaReader>; + +using InputFormatPtr = std::shared_ptr<IInputFormat>; +using OutputFormatPtr = std::shared_ptr<IOutputFormat>; +using RowOutputFormatPtr = 
std::shared_ptr<IRowOutputFormat>; + +template <typename Allocator> +struct Memory; + +FormatSettings getFormatSettings(ContextPtr context); + +template <typename T> +FormatSettings getFormatSettings(ContextPtr context, const T & settings); + +/** Allows to create an IInputFormat or IOutputFormat by the name of the format. + * Note: format and compression are independent things. + */ +class FormatFactory final : private boost::noncopyable +{ +public: + /** Fast reading data from buffer and save result to memory. + * Reads at least `min_bytes` and some more until the end of the chunk, depends on the format. + * If `max_rows` is non-zero the function also stops after reading the `max_rows` number of rows + * (even if the `min_bytes` boundary isn't reached yet). + * Used in ParallelParsingInputFormat. + */ + using FileSegmentationEngine = std::function<std::pair<bool, size_t>( + ReadBuffer & buf, + DB::Memory<Allocator<false>> & memory, + size_t min_bytes, + size_t max_rows)>; + +private: + // On the input side, there are two kinds of formats: + // * InputCreator - formats parsed sequentially, e.g. CSV. Almost all formats are like this. + // FormatFactory uses ParallelReadBuffer to read in parallel, and ParallelParsingInputFormat + // to parse in parallel; the formats mostly don't need to worry about it. + // * RandomAccessInputCreator - column-oriented formats that require seeking back and forth in + // the file when reading. E.g. Parquet has metadata at the end of the file (needs to be read + // before we can parse any data), can skip columns by seeking in the file, and often reads + // many short byte ranges from the file. ParallelReadBuffer and ParallelParsingInputFormat + // are a poor fit. Instead, the format implementation is in charge of parallelizing both + // reading and parsing. 
+ + using InputCreator = std::function<InputFormatPtr( + ReadBuffer & buf, + const Block & header, + const RowInputFormatParams & params, + const FormatSettings & settings)>; + + // Incompatible with FileSegmentationEngine. + using RandomAccessInputCreator = std::function<InputFormatPtr( + ReadBuffer & buf, + const Block & header, + const FormatSettings & settings, + const ReadSettings& read_settings, + bool is_remote_fs, + size_t max_download_threads, + size_t max_parsing_threads)>; + + using OutputCreator = std::function<OutputFormatPtr( + WriteBuffer & buf, + const Block & sample, + const FormatSettings & settings)>; + + /// Some input formats can have non trivial readPrefix() and readSuffix(), + /// so in some cases there is no possibility to use parallel parsing. + /// The checker should return true if parallel parsing should be disabled. + using NonTrivialPrefixAndSuffixChecker = std::function<bool(ReadBuffer & buf)>; + + /// Some formats can support append depending on settings. + /// The checker should return true if format support append. + using AppendSupportChecker = std::function<bool(const FormatSettings & settings)>; + + using SchemaReaderCreator = std::function<SchemaReaderPtr(ReadBuffer & in, const FormatSettings & settings)>; + using ExternalSchemaReaderCreator = std::function<ExternalSchemaReaderPtr(const FormatSettings & settings)>; + + /// Some formats can extract different schemas from the same source depending on + /// some settings. To process this case in schema cache we should add some additional + /// information to a cache key. This getter should return some string with information + /// about such settings. For example, for Protobuf format it's the path to the schema + /// and the name of the message. + using AdditionalInfoForSchemaCacheGetter = std::function<String(const FormatSettings & settings)>; + + /// Some formats can support reading subset of columns depending on settings. 
+    /// The checker should return true if the format supports reading a subset of columns.
+    using SubsetOfColumnsSupportChecker = std::function<bool(const FormatSettings & settings)>;
+
+    /// Everything that can be registered under one format name.
+    /// Callbacks that were never registered stay empty, and the flags stay false.
+    struct Creators
+    {
+        InputCreator input_creator;
+        RandomAccessInputCreator random_access_input_creator;
+        OutputCreator output_creator;
+        FileSegmentationEngine file_segmentation_engine;
+        SchemaReaderCreator schema_reader_creator;
+        ExternalSchemaReaderCreator external_schema_reader_creator;
+        bool supports_parallel_formatting{false};
+        bool prefers_large_blocks{false};
+        NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker;
+        AppendSupportChecker append_support_checker;
+        AdditionalInfoForSchemaCacheGetter additional_info_for_schema_cache_getter;
+        SubsetOfColumnsSupportChecker subset_of_columns_support_checker;
+    };
+
+    /// Format name -> what is registered for it.
+    using FormatsDictionary = std::unordered_map<String, Creators>;
+    /// Lower-cased file extension -> format name.
+    using FileExtensionFormats = std::unordered_map<String, String>;
+
+public:
+    static FormatFactory & instance();
+
+    /// This has two tricks up its sleeve:
+    ///  * Parallel reading.
+    ///    To enable it, make sure `buf` is a SeekableReadBuffer implementing readBigAt().
+    ///  * Parallel parsing.
+    /// `buf` must outlive the returned IInputFormat.
+    InputFormatPtr getInput(
+        const String & name,
+        ReadBuffer & buf,
+        const Block & sample,
+        ContextPtr context,
+        UInt64 max_block_size,
+        const std::optional<FormatSettings> & format_settings = std::nullopt,
+        std::optional<size_t> max_parsing_threads = std::nullopt,
+        std::optional<size_t> max_download_threads = std::nullopt,
+        // affects things like buffer sizes and parallel reading
+        bool is_remote_fs = false,
+        // allows to do: buf -> parallel read -> decompression,
+        // because parallel read after decompression is not possible
+        CompressionMethod compression = CompressionMethod::None) const;
+
+    /// Checks all preconditions. Returns ordinary format if parallel formatting cannot be done.
+    OutputFormatPtr getOutputFormatParallelIfPossible(
+        const String & name,
+        WriteBuffer & buf,
+        const Block & sample,
+        ContextPtr context,
+        const std::optional<FormatSettings> & format_settings = std::nullopt) const;
+
+    /// Creates the output format `name` writing to `buf`.
+    /// Throws FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT if the format has no output creator.
+    OutputFormatPtr getOutputFormat(
+        const String & name,
+        WriteBuffer & buf,
+        const Block & sample,
+        ContextPtr context,
+        const std::optional<FormatSettings> & _format_settings = std::nullopt) const;
+
+    /// Returns the content type reported by the output format `name`.
+    /// A throwaway format instance over an empty block and buffer is created to ask it.
+    String getContentType(
+        const String & name,
+        ContextPtr context,
+        const std::optional<FormatSettings> & format_settings = std::nullopt) const;
+
+    /// Throws LOGICAL_ERROR if the format has no schema reader registered.
+    SchemaReaderPtr getSchemaReader(
+        const String & name,
+        ReadBuffer & buf,
+        ContextPtr & context,
+        const std::optional<FormatSettings> & format_settings = std::nullopt) const;
+
+    /// Throws LOGICAL_ERROR if the format has no external schema reader registered.
+    ExternalSchemaReaderPtr getExternalSchemaReader(
+        const String & name,
+        ContextPtr & context,
+        const std::optional<FormatSettings> & format_settings = std::nullopt) const;
+
+    void registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine);
+
+    void registerNonTrivialPrefixAndSuffixChecker(const String & name, NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker);
+
+    void registerAppendSupportChecker(const String & name, AppendSupportChecker append_support_checker);
+
+    /// If the format never supports append, you can use this method instead of
+    /// registerAppendSupportChecker with an append_support_checker that always returns false.
+    void markFormatHasNoAppendSupport(const String & name);
+
+    /// Returns true when no append support checker is registered for the format.
+    bool checkIfFormatSupportAppend(const String & name, ContextPtr context, const std::optional<FormatSettings> & format_settings_ = std::nullopt);
+
+    /// Register format by its name.
+    void registerInputFormat(const String & name, InputCreator input_creator);
+    void registerRandomAccessInputFormat(const String & name, RandomAccessInputCreator input_creator);
+    void registerOutputFormat(const String & name, OutputCreator output_creator);
+
+    /// Register a file extension for a format. Extensions are matched case-insensitively.
+    void registerFileExtension(const String & extension, const String & format_name);
+    /// Returns an empty string when the format cannot be deduced,
+    /// or throws BAD_ARGUMENTS if `throw_if_not_found` is set.
+    String getFormatFromFileName(String file_name, bool throw_if_not_found = false);
+    String getFormatFromFileDescriptor(int fd);
+
+    /// Register schema readers for a format by its name.
+    void registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator);
+    void registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator);
+
+    void markOutputFormatSupportsParallelFormatting(const String & name);
+    void markOutputFormatPrefersLargeBlocks(const String & name);
+
+    /// markFormatSupportsSubsetOfColumns registers a checker that always returns true.
+    void markFormatSupportsSubsetOfColumns(const String & name);
+    void registerSubsetOfColumnsSupportChecker(const String & name, SubsetOfColumnsSupportChecker subset_of_columns_support_checker);
+    /// Returns false when no checker is registered for the format.
+    bool checkIfFormatSupportsSubsetOfColumns(const String & name, const ContextPtr & context, const std::optional<FormatSettings> & format_settings_ = std::nullopt) const;
+
+    bool checkIfFormatHasSchemaReader(const String & name) const;
+    bool checkIfFormatHasExternalSchemaReader(const String & name) const;
+    bool checkIfFormatHasAnySchemaReader(const String & name) const;
+    bool checkIfOutputFormatPrefersLargeBlocks(const String & name) const;
+
+    /// Returns false only for "Parquet" when input_format_parquet_preserve_order is set.
+    bool checkParallelizeOutputAfterReading(const String & name, ContextPtr context) const;
+
+    void registerAdditionalInfoForSchemaCacheGetter(const String & name, AdditionalInfoForSchemaCacheGetter additional_info_for_schema_cache_getter);
+    /// Returns an empty string when no getter is registered for the format.
+    String getAdditionalInfoForSchemaCache(const String & name, ContextPtr context, const std::optional<FormatSettings> & format_settings_ = std::nullopt);
+
+    const FormatsDictionary &
getAllFormats() const + { + return dict; + } + + bool isInputFormat(const String & name) const; + bool isOutputFormat(const String & name) const; + + /// Check that format with specified name exists and throw an exception otherwise. + void checkFormatName(const String & name) const; + +private: + FormatsDictionary dict; + FileExtensionFormats file_extension_formats; + + const Creators & getCreators(const String & name) const; + + // Creates a ReadBuffer to give to an input format. Returns nullptr if we should use `buf` directly. + std::unique_ptr<ReadBuffer> wrapReadBufferIfNeeded( + ReadBuffer & buf, + CompressionMethod compression, + const Creators & creators, + const FormatSettings & format_settings, + const Settings & settings, + bool is_remote_fs, + size_t max_download_threads) const; +}; + +} diff --git a/contrib/clickhouse/src/Formats/FormatSchemaInfo.cpp b/contrib/clickhouse/src/Formats/FormatSchemaInfo.cpp new file mode 100644 index 0000000000..c0f0aec6fd --- /dev/null +++ b/contrib/clickhouse/src/Formats/FormatSchemaInfo.cpp @@ -0,0 +1,190 @@ +#include <Formats/FormatSchemaInfo.h> +#include <Interpreters/Context.h> +#include <Common/Exception.h> +#include <Common/filesystemHelpers.h> +#include <Disks/IO/WriteBufferFromTemporaryFile.h> +#include <filesystem> + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +namespace fs = std::filesystem; + +namespace +{ + String getFormatSchemaDefaultFileExtension(const String & format) + { + if (format == "Protobuf") + return "proto"; + else if (format == "CapnProto") + return "capnp"; + else + return ""; + } +} + + +FormatSchemaInfo::FormatSchemaInfo(const String & format_schema, const String & format, bool require_message, bool is_server, const std::string & format_schema_path) +{ + if (format_schema.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The format {} requires a schema. 
The corresponding setting should be set", format); + + String default_file_extension = getFormatSchemaDefaultFileExtension(format); + + fs::path path; + if (require_message) + { + size_t colon_pos = format_schema.find(':'); + if ((colon_pos == String::npos) || (colon_pos == 0) || (colon_pos == format_schema.length() - 1)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Format schema requires the 'format_schema' setting to have the 'schema_file:message_name' format{}. Got '{}'", + (default_file_extension.empty() ? "" : ", e.g. 'schema." + default_file_extension + ":Message'"), format_schema); + } + else + { + path = fs::path(format_schema.substr(0, colon_pos)); + String filename = path.has_filename() ? path.filename() : path.parent_path().filename(); + if (filename.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Format schema requires the 'format_schema' setting to have the 'schema_file:message_name' format{}. Got '{}'", + (default_file_extension.empty() ? "" : ", e.g. 'schema." + default_file_extension + ":Message'"), format_schema); + } + message_name = format_schema.substr(colon_pos + 1); + } + else + { + path = fs::path(format_schema); + if (!path.has_filename()) + path = path.parent_path() / ""; + } + + auto default_schema_directory = [&format_schema_path]() + { + static const String str = fs::canonical(format_schema_path) / ""; + return str; + }; + + if (!path.has_extension() && !default_file_extension.empty()) + path = path.parent_path() / (path.stem().string() + '.' 
+ default_file_extension); + + fs::path default_schema_directory_path(default_schema_directory()); + if (path.is_absolute()) + { + if (is_server) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Absolute path in the 'format_schema' setting is prohibited: {}", path.string()); + schema_path = path.filename(); + schema_directory = path.parent_path() / ""; + } + else if (path.has_parent_path() && !fs::weakly_canonical(default_schema_directory_path / path).string().starts_with(fs::weakly_canonical(default_schema_directory_path).string())) + { + if (is_server) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Path in the 'format_schema' setting shouldn't go outside the 'format_schema_path' directory: {} ({} not in {})", + default_schema_directory(), + path.string(), + default_schema_directory()); + path = default_schema_directory_path / path; + schema_path = path.filename(); + schema_directory = path.parent_path() / ""; + } + else + { + schema_path = path; + schema_directory = default_schema_directory(); + } +} + +FormatSchemaInfo::FormatSchemaInfo(const FormatSettings & settings, const String & format, bool require_message) + : FormatSchemaInfo( + settings.schema.format_schema, format, require_message, settings.schema.is_server, settings.schema.format_schema_path) +{ +} + +template <typename SchemaGenerator> +MaybeAutogeneratedFormatSchemaInfo<SchemaGenerator>::MaybeAutogeneratedFormatSchemaInfo( + const FormatSettings & settings, const String & format, const Block & header, bool use_autogenerated_schema) +{ + if (!use_autogenerated_schema || !settings.schema.format_schema.empty()) + { + schema_info = std::make_unique<FormatSchemaInfo>(settings, format, true); + return; + } + + String schema_path; + fs::path default_schema_directory_path(fs::canonical(settings.schema.format_schema_path) / ""); + fs::path path; + if (!settings.schema.output_format_schema.empty()) + { + schema_path = settings.schema.output_format_schema; + path = schema_path; + if (path.is_absolute()) + { + 
if (settings.schema.is_server) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Absolute path in the 'output_format_schema' setting is prohibited: {}", path.string()); + } + else if (path.has_parent_path() && !fs::weakly_canonical(default_schema_directory_path / path).string().starts_with(fs::weakly_canonical(default_schema_directory_path).string())) + { + if (settings.schema.is_server) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Path in the 'format_schema' setting shouldn't go outside the 'format_schema_path' directory: {} ({} not in {})", + default_schema_directory_path.string(), + path.string(), + default_schema_directory_path.string()); + path = default_schema_directory_path / path; + } + else + { + path = default_schema_directory_path / path; + } + } + else + { + if (settings.schema.is_server) + { + tmp_file_path = PocoTemporaryFile::tempName(default_schema_directory_path.string()) + '.' + getFormatSchemaDefaultFileExtension(format); + schema_path = fs::path(tmp_file_path).filename(); + } + else + { + tmp_file_path = PocoTemporaryFile::tempName() + '.' + getFormatSchemaDefaultFileExtension(format); + schema_path = tmp_file_path; + } + + path = tmp_file_path; + } + + WriteBufferFromFile buf(path.string()); + SchemaGenerator::writeSchema(buf, "Message", header.getNamesAndTypesList()); + buf.finalize(); + + schema_info = std::make_unique<FormatSchemaInfo>(schema_path + ":Message", format, true, settings.schema.is_server, settings.schema.format_schema_path); +} + +template <typename SchemaGenerator> +MaybeAutogeneratedFormatSchemaInfo<SchemaGenerator>::~MaybeAutogeneratedFormatSchemaInfo() +{ + if (!tmp_file_path.empty()) + { + try + { + fs::remove(tmp_file_path); + } + catch (...) 
+ { + tryLogCurrentException("MaybeAutogeneratedFormatSchemaInfo", "Cannot delete temporary schema file"); + } + } +} + +template class MaybeAutogeneratedFormatSchemaInfo<StructureToCapnProtoSchema>; +template class MaybeAutogeneratedFormatSchemaInfo<StructureToProtobufSchema>; + +} diff --git a/contrib/clickhouse/src/Formats/FormatSchemaInfo.h b/contrib/clickhouse/src/Formats/FormatSchemaInfo.h new file mode 100644 index 0000000000..e8758c3f76 --- /dev/null +++ b/contrib/clickhouse/src/Formats/FormatSchemaInfo.h @@ -0,0 +1,54 @@ +#pragma once + +#include <base/types.h> +#include <Formats/FormatSettings.h> +#include <Formats/StructureToCapnProtoSchema.h> +#include <Formats/StructureToProtobufSchema.h> + +namespace DB +{ +class Context; + +/// Extracts information about where the format schema file is from passed context and keep it. +class FormatSchemaInfo +{ +public: + FormatSchemaInfo(const String & format_schema, const String & format, bool require_message, bool is_server, const std::string & format_schema_path); + FormatSchemaInfo(const FormatSettings & settings, const String & format, bool require_message); + + /// Returns path to the schema file. + const String & schemaPath() const { return schema_path; } + String absoluteSchemaPath() const { return schema_directory + schema_path; } + + /// Returns directory containing the schema file. + const String & schemaDirectory() const { return schema_directory; } + + /// Returns name of the message type. 
+ const String & messageName() const { return message_name; } + +private: + String schema_path; + String schema_directory; + String message_name; +}; + + +template <typename SchemaGenerator> +class MaybeAutogeneratedFormatSchemaInfo +{ +public: + MaybeAutogeneratedFormatSchemaInfo(const FormatSettings & settings, const String & format, const Block & header, bool use_autogenerated_schema); + + ~MaybeAutogeneratedFormatSchemaInfo(); + + const FormatSchemaInfo & getSchemaInfo() const { return *schema_info; } +private: + + std::unique_ptr<FormatSchemaInfo> schema_info; + String tmp_file_path; +}; + +using CapnProtoSchemaInfo = MaybeAutogeneratedFormatSchemaInfo<StructureToCapnProtoSchema>; +using ProtobufSchemaInfo = MaybeAutogeneratedFormatSchemaInfo<StructureToProtobufSchema>; + +} diff --git a/contrib/clickhouse/src/Formats/FormatSettings.h b/contrib/clickhouse/src/Formats/FormatSettings.h new file mode 100644 index 0000000000..0c760f9151 --- /dev/null +++ b/contrib/clickhouse/src/Formats/FormatSettings.h @@ -0,0 +1,410 @@ +#pragma once + +#include <Core/Names.h> +#include <Core/Defines.h> +#include <base/types.h> +#include <base/unit.h> + + +namespace DB +{ + +/** + * Various tweaks for input/output formats. Text serialization/deserialization + * of data types also depend on some of these settings. It is different from + * FormatFactorySettings in that it has all necessary user-provided settings + * combined with information from context etc, that we can use directly during + * serialization. In contrast, FormatFactorySettings' job is to reflect the + * changes made to user-visible format settings, such as when tweaking the + * the format for File engine. + * NOTE Parameters for unrelated formats and unrelated data types are collected + * in this struct - it prevents modularity, but they are difficult to separate. + */ +struct FormatSettings +{ + /// Format will be used for streaming. 
Not every formats support it + /// Option means that each chunk of data need to be formatted independently. Also each chunk will be flushed at the end of processing. + bool enable_streaming = false; + + bool skip_unknown_fields = false; + bool with_names_use_header = false; + bool with_types_use_header = false; + bool write_statistics = true; + bool import_nested_json = false; + bool null_as_default = true; + bool decimal_trailing_zeros = false; + bool defaults_for_omitted_fields = true; + + bool seekable_read = true; + UInt64 max_rows_to_read_for_schema_inference = 25000; + UInt64 max_bytes_to_read_for_schema_inference = 32 * 1024 * 1024; + + String column_names_for_schema_inference; + String schema_inference_hints; + + bool try_infer_integers = false; + bool try_infer_dates = false; + bool try_infer_datetimes = false; + + enum class DateTimeInputFormat + { + Basic, /// Default format for fast parsing: YYYY-MM-DD hh:mm:ss (ISO-8601 without fractional part and timezone) or NNNNNNNNNN unix timestamp. + BestEffort, /// Use sophisticated rules to parse whatever possible. 
+ BestEffortUS /// Use sophisticated rules to parse American style: mm/dd/yyyy + }; + + DateTimeInputFormat date_time_input_format = DateTimeInputFormat::Basic; + + enum class DateTimeOutputFormat + { + Simple, + ISO, + UnixTimestamp + }; + + enum class EscapingRule + { + None, + Escaped, + Quoted, + CSV, + JSON, + XML, + Raw + }; + + bool schema_inference_make_columns_nullable = true; + + DateTimeOutputFormat date_time_output_format = DateTimeOutputFormat::Simple; + + enum class IntervalOutputFormat + { + Kusto, + Numeric + }; + + struct + { + IntervalOutputFormat output_format = IntervalOutputFormat::Numeric; + } interval; + + bool input_format_ipv4_default_on_conversion_error = false; + bool input_format_ipv6_default_on_conversion_error = false; + + UInt64 input_allow_errors_num = 0; + Float32 input_allow_errors_ratio = 0; + + UInt64 max_binary_string_size = 1_GiB; + UInt64 max_binary_array_size = 1_GiB; + UInt64 client_protocol_version = 0; + + UInt64 max_parser_depth = DBMS_DEFAULT_MAX_PARSER_DEPTH; + + size_t max_threads = 1; + + enum class ArrowCompression + { + NONE, + LZ4_FRAME, + ZSTD + }; + + struct + { + UInt64 row_group_size = 1000000; + bool low_cardinality_as_dictionary = false; + bool allow_missing_columns = false; + bool skip_columns_with_unsupported_types_in_schema_inference = false; + bool case_insensitive_column_matching = false; + bool output_string_as_string = false; + bool output_fixed_string_as_fixed_byte_array = true; + ArrowCompression output_compression_method = ArrowCompression::NONE; + } arrow; + + struct + { + String schema_registry_url; + String output_codec; + UInt64 output_sync_interval = 16 * 1024; + bool allow_missing_fields = false; + String string_column_pattern; + UInt64 output_rows_in_file = 1; + } avro; + + String bool_true_representation = "true"; + String bool_false_representation = "false"; + + struct CSV + { + char delimiter = ','; + bool allow_single_quotes = true; + bool allow_double_quotes = true; + bool 
empty_as_default = false; + bool crlf_end_of_line = false; + bool enum_as_number = false; + bool arrays_as_nested_csv = false; + String null_representation = "\\N"; + char tuple_delimiter = ','; + bool use_best_effort_in_schema_inference = true; + UInt64 skip_first_lines = 0; + String custom_delimiter; + bool try_detect_header = true; + bool skip_trailing_empty_lines = false; + bool trim_whitespaces = true; + bool allow_whitespace_or_tab_as_delimiter = false; + bool allow_variable_number_of_columns = false; + bool use_default_on_bad_values = false; + } csv; + + struct HiveText + { + char fields_delimiter = '\x01'; + char collection_items_delimiter = '\x02'; + char map_keys_delimiter = '\x03'; + Names input_field_names; + } hive_text; + + struct Custom + { + std::string result_before_delimiter; + std::string result_after_delimiter; + std::string row_before_delimiter; + std::string row_after_delimiter; + std::string row_between_delimiter; + std::string field_delimiter; + EscapingRule escaping_rule = EscapingRule::Escaped; + bool try_detect_header = true; + bool skip_trailing_empty_lines = false; + bool allow_variable_number_of_columns = false; + } custom; + + struct + { + bool array_of_rows = false; + bool quote_64bit_integers = true; + bool quote_64bit_floats = false; + bool quote_denormals = true; + bool quote_decimals = false; + bool escape_forward_slashes = true; + bool read_named_tuples_as_objects = false; + bool write_named_tuples_as_objects = false; + bool defaults_for_missing_elements_in_named_tuple = false; + bool ignore_unknown_keys_in_named_tuple = false; + bool serialize_as_strings = false; + bool read_bools_as_numbers = true; + bool read_numbers_as_strings = true; + bool read_objects_as_strings = true; + bool try_infer_numbers_from_strings = false; + bool validate_types_from_metadata = true; + bool validate_utf8 = false; + bool allow_object_type = false; + bool compact_allow_variable_number_of_columns = false; + } json; + + struct + { + String 
column_for_object_name; + } json_object_each_row; + + enum class ParquetVersion + { + V1_0, + V2_4, + V2_6, + V2_LATEST, + }; + + enum class ParquetCompression + { + NONE, + SNAPPY, + ZSTD, + LZ4, + GZIP, + BROTLI, + }; + + struct + { + UInt64 row_group_rows = 1000000; + UInt64 row_group_bytes = 512 * 1024 * 1024; + bool allow_missing_columns = false; + bool skip_columns_with_unsupported_types_in_schema_inference = false; + bool case_insensitive_column_matching = false; + bool filter_push_down = true; + std::unordered_set<int> skip_row_groups = {}; + bool output_string_as_string = false; + bool output_fixed_string_as_fixed_byte_array = true; + bool preserve_order = false; + bool use_custom_encoder = true; + bool parallel_encoding = true; + UInt64 max_block_size = 8192; + ParquetVersion output_version; + ParquetCompression output_compression_method = ParquetCompression::SNAPPY; + bool output_compliant_nested_types = true; + size_t data_page_size = 1024 * 1024; + size_t write_batch_size = 1024; + size_t local_read_min_bytes_for_seek = 8192; + } parquet; + + struct Pretty + { + UInt64 max_rows = 10000; + UInt64 max_column_pad_width = 250; + UInt64 max_value_width = 10000; + bool color = true; + + bool output_format_pretty_row_numbers = false; + + enum class Charset + { + UTF8, + ASCII, + }; + + Charset charset = Charset::UTF8; + } pretty; + + struct + { + bool input_flatten_google_wrappers = false; + bool output_nullables_with_google_wrappers = false; + /** + * Some buffers (kafka / rabbit) split the rows internally using callback, + * and always send one row per message, so we can push there formats + * without framing / delimiters (like ProtobufSingle). In other cases, + * we have to enforce exporting at most one row in the format output, + * because Protobuf without delimiters is not generally useful. 
+ */ + bool allow_multiple_rows_without_delimiter = false; + bool skip_fields_with_unsupported_types_in_schema_inference = false; + bool use_autogenerated_schema = true; + } protobuf; + + struct + { + uint32_t client_capabilities = 0; + size_t max_packet_size = 0; + uint8_t * sequence_id = nullptr; /// Not null if it's MySQLWire output format used to handle MySQL protocol connections. + } mysql_wire; + + struct + { + std::string regexp; + EscapingRule escaping_rule = EscapingRule::Raw; + bool skip_unmatched = false; + } regexp; + + struct + { + std::string format_schema; + std::string format_schema_path; + bool is_server = false; + std::string output_format_schema; + } schema; + + struct + { + String resultset_format; + String row_format; + String row_between_delimiter; + } template_settings; + + struct + { + bool empty_as_default = false; + bool crlf_end_of_line = false; + String null_representation = "\\N"; + bool enum_as_number = false; + bool use_best_effort_in_schema_inference = true; + UInt64 skip_first_lines = 0; + bool try_detect_header = true; + bool skip_trailing_empty_lines = false; + bool allow_variable_number_of_columns = false; + } tsv; + + struct + { + bool interpret_expressions = true; + bool deduce_templates_of_expressions = true; + bool accurate_types_of_literals = true; + } values; + + enum class ORCCompression + { + NONE, + LZ4, + SNAPPY, + ZSTD, + ZLIB, + }; + + struct + { + bool allow_missing_columns = false; + int64_t row_batch_size = 100'000; + bool skip_columns_with_unsupported_types_in_schema_inference = false; + bool case_insensitive_column_matching = false; + std::unordered_set<int> skip_stripes = {}; + bool output_string_as_string = false; + ORCCompression output_compression_method = ORCCompression::NONE; + bool use_fast_decoder = true; + } orc; + + /// For capnProto format we should determine how to + /// compare ClickHouse Enum and Enum from schema. 
+ enum class CapnProtoEnumComparingMode + { + BY_NAMES, // Names in enums should be the same, values can be different. + BY_NAMES_CASE_INSENSITIVE, // Case-insensitive name comparison. + BY_VALUES, // Values should be the same, names can be different. + }; + + struct CapnProto + { + CapnProtoEnumComparingMode enum_comparing_mode = CapnProtoEnumComparingMode::BY_VALUES; + bool skip_fields_with_unsupported_types_in_schema_inference = false; + bool use_autogenerated_schema = true; + } capn_proto; + + enum class MsgPackUUIDRepresentation + { + STR, // Output UUID as a string of 36 characters. + BIN, // Output UUID as 16-bytes binary. + EXT, // Output UUID as ExtType = 2 + }; + + struct + { + UInt64 number_of_columns = 0; + MsgPackUUIDRepresentation output_uuid_representation = MsgPackUUIDRepresentation::EXT; + } msgpack; + + struct MySQLDump + { + String table_name; + bool map_column_names = true; + } mysql_dump; + + struct + { + UInt64 max_batch_size = DEFAULT_BLOCK_SIZE; + String table_name = "table"; + bool include_column_names = true; + bool use_replace = false; + bool quote_names = true; + } sql_insert; + + struct + { + bool output_string_as_string; + bool skip_fields_with_unsupported_types_in_schema_inference; + } bson; + + struct + { + bool allow_types_conversion = true; + } native; +}; + +} diff --git a/contrib/clickhouse/src/Formats/IndexForNativeFormat.cpp b/contrib/clickhouse/src/Formats/IndexForNativeFormat.cpp new file mode 100644 index 0000000000..bb41012537 --- /dev/null +++ b/contrib/clickhouse/src/Formats/IndexForNativeFormat.cpp @@ -0,0 +1,91 @@ +#include <Formats/IndexForNativeFormat.h> +#include <IO/ReadHelpers.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INCORRECT_INDEX; +} + +void IndexOfBlockForNativeFormat::read(ReadBuffer & istr) +{ + readVarUInt(num_columns, istr); + readVarUInt(num_rows, istr); + columns.clear(); + for (size_t i = 0; i < num_columns; ++i) + { + auto & column = columns.emplace_back(); + 
readBinary(column.name, istr); + readBinary(column.type, istr); + readBinaryLittleEndian(column.location.offset_in_compressed_file, istr); + readBinaryLittleEndian(column.location.offset_in_decompressed_block, istr); + } +} + +void IndexOfBlockForNativeFormat::write(WriteBuffer & ostr) const +{ + writeVarUInt(num_columns, ostr); + writeVarUInt(num_rows, ostr); + for (size_t i = 0; i < num_columns; ++i) + { + const auto & column = columns[i]; + writeBinary(column.name, ostr); + writeBinary(column.type, ostr); + writeBinaryLittleEndian(column.location.offset_in_compressed_file, ostr); + writeBinaryLittleEndian(column.location.offset_in_decompressed_block, ostr); + } +} + +IndexOfBlockForNativeFormat IndexOfBlockForNativeFormat::extractIndexForColumns(const NameSet & required_columns) const +{ + if (num_columns < required_columns.size()) + throw Exception(ErrorCodes::INCORRECT_INDEX, "Index contain less than required columns"); + + IndexOfBlockForNativeFormat res; + for (size_t i = 0; i < num_columns; ++i) + { + const auto & column = columns[i]; + if (required_columns.contains(column.name)) + res.columns.push_back(column); + } + + if (res.columns.size() < required_columns.size()) + throw Exception(ErrorCodes::INCORRECT_INDEX, "Index contain less than required columns"); + if (res.columns.size() > required_columns.size()) + throw Exception(ErrorCodes::INCORRECT_INDEX, "Index contain duplicate columns"); + + res.num_columns = res.columns.size(); + res.num_rows = num_rows; + return res; +} + + +void IndexForNativeFormat::read(ReadBuffer & istr) +{ + blocks.clear(); + while (!istr.eof()) + { + auto & block = blocks.emplace_back(); + block.read(istr); + } +} + +void IndexForNativeFormat::write(WriteBuffer & ostr) const +{ + for (const auto & block : blocks) + block.write(ostr); +} + +IndexForNativeFormat IndexForNativeFormat::extractIndexForColumns(const NameSet & required_columns) const +{ + IndexForNativeFormat res; + res.blocks.reserve(blocks.size()); + for (const auto 
& block : blocks) + res.blocks.emplace_back(block.extractIndexForColumns(required_columns)); + return res; +} + +} diff --git a/contrib/clickhouse/src/Formats/IndexForNativeFormat.h b/contrib/clickhouse/src/Formats/IndexForNativeFormat.h new file mode 100644 index 0000000000..646f539ebd --- /dev/null +++ b/contrib/clickhouse/src/Formats/IndexForNativeFormat.h @@ -0,0 +1,60 @@ +#pragma once + +#include <Core/Names.h> +#include <Formats/MarkInCompressedFile.h> + +namespace DB +{ + +/** The Native format can contain a separately located index, + * which allows you to understand where what column is located, + * and skip unnecessary columns. + */ + +/** The position of one piece of a single column. */ +struct IndexOfOneColumnForNativeFormat +{ + String name; + String type; + MarkInCompressedFile location; +}; + +/** The index for the data block. */ +struct IndexOfBlockForNativeFormat +{ + using Columns = std::vector<IndexOfOneColumnForNativeFormat>; + + size_t num_columns; + size_t num_rows; + Columns columns; + + /// Reads the index for the data block. + void read(ReadBuffer & istr); + + /// Writes the index for the data block. + void write(WriteBuffer & ostr) const; + + /// Returns the index only for the required columns. + IndexOfBlockForNativeFormat extractIndexForColumns(const NameSet & required_columns) const; +}; + +/** The whole index. */ +struct IndexForNativeFormat +{ + using Blocks = std::vector<IndexOfBlockForNativeFormat>; + Blocks blocks; + + bool empty() const { return blocks.empty(); } + void clear() { blocks.clear(); } + + /// Reads the index. + void read(ReadBuffer & istr); + + /// Writes the index. + void write(WriteBuffer & ostr) const; + + /// Returns the index only for the required columns. 
+ IndexForNativeFormat extractIndexForColumns(const NameSet & required_columns) const; +}; + +} diff --git a/contrib/clickhouse/src/Formats/JSONUtils.cpp b/contrib/clickhouse/src/Formats/JSONUtils.cpp new file mode 100644 index 0000000000..6fbda86915 --- /dev/null +++ b/contrib/clickhouse/src/Formats/JSONUtils.cpp @@ -0,0 +1,733 @@ +#include <IO/ReadHelpers.h> +#include <Formats/JSONUtils.h> +#include <Formats/ReadSchemaUtils.h> +#include <Formats/EscapingRuleUtils.h> +#include <IO/ReadBufferFromString.h> +#include <IO/WriteBufferValidUTF8.h> +#include <DataTypes/Serializations/SerializationNullable.h> +#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/DataTypeObject.h> +#include <DataTypes/DataTypeFactory.h> + +#include <base/find_symbols.h> + +#include <Common/logger_useful.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; + extern const int LOGICAL_ERROR; +} + +namespace JSONUtils +{ + + template <const char opening_bracket, const char closing_bracket> + static std::pair<bool, size_t> + fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t min_rows, size_t max_rows) + { + skipWhitespaceIfAny(in); + + char * pos = in.position(); + size_t balance = 0; + bool quotes = false; + size_t number_of_rows = 0; + bool need_more_data = true; + + if (max_rows && (max_rows < min_rows)) + max_rows = min_rows; + + while (loadAtPosition(in, memory, pos) && need_more_data) + { + const auto current_object_size = memory.size() + static_cast<size_t>(pos - in.position()); + if (min_bytes != 0 && current_object_size > 10 * min_bytes) + throw ParsingException(ErrorCodes::INCORRECT_DATA, + "Size of JSON object at position {} is extremely large. Expected not greater than {} bytes, but current is {} bytes per row. 
" + "Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, " + "most likely JSON is malformed", in.count(), min_bytes, current_object_size); + + if (quotes) + { + pos = find_first_symbols<'\\', '"'>(pos, in.buffer().end()); + + if (pos > in.buffer().end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Position in buffer is out of bounds. There must be a bug."); + else if (pos == in.buffer().end()) + continue; + + if (*pos == '\\') + { + ++pos; + if (loadAtPosition(in, memory, pos)) + ++pos; + } + else if (*pos == '"') + { + ++pos; + quotes = false; + } + } + else + { + pos = find_first_symbols<opening_bracket, closing_bracket, '\\', '"'>(pos, in.buffer().end()); + + if (pos > in.buffer().end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Position in buffer is out of bounds. There must be a bug."); + else if (pos == in.buffer().end()) + continue; + + else if (*pos == opening_bracket) + { + ++balance; + ++pos; + } + else if (*pos == closing_bracket) + { + --balance; + ++pos; + } + else if (*pos == '\\') + { + ++pos; + if (loadAtPosition(in, memory, pos)) + ++pos; + } + else if (*pos == '"') + { + quotes = true; + ++pos; + } + + if (balance == 0) + { + ++number_of_rows; + if ((number_of_rows >= min_rows) + && ((memory.size() + static_cast<size_t>(pos - in.position()) >= min_bytes) || (number_of_rows == max_rows))) + need_more_data = false; + } + } + } + + saveUpToPosition(in, memory, pos); + return {loadAtPosition(in, memory, pos), number_of_rows}; + } + + std::pair<bool, size_t> fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t max_rows) + { + return fileSegmentationEngineJSONEachRowImpl<'{', '}'>(in, memory, min_bytes, 1, max_rows); + } + + std::pair<bool, size_t> + fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t min_rows, size_t max_rows) + { + return fileSegmentationEngineJSONEachRowImpl<'[', ']'>(in, memory, 
min_bytes, min_rows, max_rows); + } + + template <const char opening_bracket, const char closing_bracket> + void skipRowForJSONEachRowImpl(ReadBuffer & in) + { + size_t balance = 0; + bool quotes = false; + while (!in.eof()) + { + if (quotes) + { + auto * pos = find_first_symbols<'\\', '"'>(in.position(), in.buffer().end()); + in.position() = pos; + + if (in.position() > in.buffer().end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Position in buffer is out of bounds. There must be a bug."); + else if (in.position() == in.buffer().end()) + continue; + + if (*in.position() == '\\') + { + ++in.position(); + if (!in.eof()) + ++in.position(); + } + else if (*in.position() == '"') + { + ++in.position(); + quotes = false; + } + } + else + { + auto * pos = find_first_symbols<opening_bracket, closing_bracket, '\\', '"'>(in.position(), in.buffer().end()); + in.position() = pos; + + if (in.position() > in.buffer().end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Position in buffer is out of bounds. 
There must be a bug."); + else if (in.position() == in.buffer().end()) + continue; + + else if (*in.position() == opening_bracket) + { + ++balance; + ++in.position(); + } + else if (*in.position() == closing_bracket) + { + --balance; + ++in.position(); + } + else if (*in.position() == '\\') + { + ++in.position(); + if (!in.eof()) + ++in.position(); + } + else if (*in.position() == '"') + { + quotes = true; + ++in.position(); + } + + if (balance == 0) + return; + } + } + + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected eof"); + + } + + void skipRowForJSONEachRow(ReadBuffer & in) + { + return skipRowForJSONEachRowImpl<'{', '}'>(in); + } + + void skipRowForJSONCompactEachRow(ReadBuffer & in) + { + return skipRowForJSONEachRowImpl<'[', ']'>(in); + } + + NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, const FormatSettings & settings, JSONInferenceInfo * inference_info) + { + skipWhitespaceIfAny(in); + assertChar('{', in); + skipWhitespaceIfAny(in); + bool first = true; + NamesAndTypesList names_and_types; + String field; + while (!in.eof() && *in.position() != '}') + { + if (!first) + assertChar(',', in); + else + first = false; + + auto name = readFieldName(in); + auto type = tryInferDataTypeForSingleJSONField(in, settings, inference_info); + names_and_types.emplace_back(name, type); + skipWhitespaceIfAny(in); + } + + if (in.eof()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading JSON object"); + + assertChar('}', in); + return names_and_types; + } + + DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, const FormatSettings & settings, JSONInferenceInfo * inference_info) + { + skipWhitespaceIfAny(in); + assertChar('[', in); + skipWhitespaceIfAny(in); + bool first = true; + DataTypes types; + String field; + while (!in.eof() && *in.position() != ']') + { + if (!first) + assertChar(',', in); + else + first = false; + auto type = tryInferDataTypeForSingleJSONField(in, settings, 
inference_info); + types.push_back(std::move(type)); + skipWhitespaceIfAny(in); + } + + if (in.eof()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading JSON array"); + + assertChar(']', in); + return types; + } + + bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf) + { + /// For JSONEachRow we can safely skip whitespace characters + skipWhitespaceIfAny(buf); + return buf.eof() || *buf.position() == '['; + } + + bool readField( + ReadBuffer & in, + IColumn & column, + const DataTypePtr & type, + const SerializationPtr & serialization, + const String & column_name, + const FormatSettings & format_settings, + bool yield_strings) + { + try + { + bool as_nullable = format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type); + + if (yield_strings) + { + String str; + readJSONString(str, in); + + ReadBufferFromString buf(str); + + if (as_nullable) + return SerializationNullable::deserializeWholeTextImpl(column, buf, format_settings, serialization); + + serialization->deserializeWholeText(column, buf, format_settings); + return true; + } + + if (as_nullable) + return SerializationNullable::deserializeTextJSONImpl(column, in, format_settings, serialization); + + serialization->deserializeTextJSON(column, in, format_settings); + return true; + } + catch (Exception & e) + { + e.addMessage("(while reading the value of key " + column_name + ")"); + throw; + } + } + + void writeFieldDelimiter(WriteBuffer & out, size_t new_lines) + { + writeChar(',', out); + writeChar('\n', new_lines, out); + } + + void writeFieldCompactDelimiter(WriteBuffer & out) { writeCString(", ", out); } + + void writeTitle(const char * title, WriteBuffer & out, size_t indent, const char * after_delimiter) + { + writeChar('\t', indent, out); + writeChar('"', out); + writeCString(title, out); + writeCString("\":", out); + writeCString(after_delimiter, out); + } + + void writeTitlePretty(const char * title, WriteBuffer & out, size_t indent, const 
char * after_delimiter) + { + writeChar(' ', indent * 4, out); + writeChar('"', out); + writeCString(title, out); + writeCString("\": ", out); + writeCString(after_delimiter, out); + } + + void writeObjectStart(WriteBuffer & out, size_t indent, const char * title) + { + if (title) + writeTitle(title, out, indent, "\n"); + writeChar('\t', indent, out); + writeCString("{\n", out); + } + + void writeCompactObjectStart(WriteBuffer & out, size_t indent, const char * title) + { + if (title) + writeTitle(title, out, indent, " "); + writeCString("{", out); + } + + void writeCompactObjectEnd(WriteBuffer & out) + { + writeChar('}', out); + } + + void writeObjectEnd(WriteBuffer & out, size_t indent) + { + writeChar('\n', out); + writeChar('\t', indent, out); + writeChar('}', out); + } + + void writeArrayStart(WriteBuffer & out, size_t indent, const char * title) + { + if (title) + writeTitle(title, out, indent, "\n"); + writeChar('\t', indent, out); + writeCString("[\n", out); + } + + void writeCompactArrayStart(WriteBuffer & out, size_t indent, const char * title) + { + if (title) + writeTitle(title, out, indent, " "); + else + writeChar('\t', indent, out); + writeCString("[", out); + } + + void writeArrayEnd(WriteBuffer & out, size_t indent) + { + writeChar('\n', out); + writeChar('\t', indent, out); + writeChar(']', out); + } + + void writeCompactArrayEnd(WriteBuffer & out) { writeChar(']', out); } + + void writeFieldFromColumn( + const IColumn & column, + const ISerialization & serialization, + size_t row_num, + bool yield_strings, + const FormatSettings & settings, + WriteBuffer & out, + const std::optional<String> & name, + size_t indent, + const char * title_after_delimiter, + bool pretty_json) + { + if (name.has_value()) + { + if (pretty_json) + { + writeTitlePretty(name->data(), out, indent, title_after_delimiter); + } + else + { + writeTitle(name->data(), out, indent, title_after_delimiter); + } + } + + if (yield_strings) + { + WriteBufferFromOwnString buf; + + 
serialization.serializeText(column, row_num, buf, settings); + writeJSONString(buf.str(), out, settings); + } + else + { + if (pretty_json) + { + serialization.serializeTextJSONPretty(column, row_num, out, settings, indent); + } + else + { + serialization.serializeTextJSON(column, row_num, out, settings); + } + } + } + + void writeColumns( + const Columns & columns, + const Names & names, + const Serializations & serializations, + size_t row_num, + bool yield_strings, + const FormatSettings & settings, + WriteBuffer & out, + size_t indent) + { + for (size_t i = 0; i < columns.size(); ++i) + { + if (i != 0) + writeFieldDelimiter(out); + writeFieldFromColumn(*columns[i], *serializations[i], row_num, yield_strings, settings, out, names[i], indent); + } + } + + void writeCompactColumns( + const Columns & columns, + const Serializations & serializations, + size_t row_num, + bool yield_strings, + const FormatSettings & settings, + WriteBuffer & out) + { + for (size_t i = 0; i < columns.size(); ++i) + { + if (i != 0) + writeFieldCompactDelimiter(out); + writeFieldFromColumn(*columns[i], *serializations[i], row_num, yield_strings, settings, out); + } + } + + void writeMetadata(const Names & names, const DataTypes & types, const FormatSettings & settings, WriteBuffer & out) + { + writeArrayStart(out, 1, "meta"); + + for (size_t i = 0; i < names.size(); ++i) + { + writeObjectStart(out, 2); + + writeTitle("name", out, 3, " "); + + /// The field names are pre-escaped to be put into JSON string literal. 
+ writeChar('"', out); + writeString(names[i], out); + writeChar('"', out); + + writeFieldDelimiter(out); + writeTitle("type", out, 3, " "); + writeJSONString(types[i]->getName(), out, settings); + writeObjectEnd(out, 2); + + if (i + 1 < names.size()) + writeFieldDelimiter(out); + } + + writeArrayEnd(out, 1); + } + + void writeAdditionalInfo( + size_t rows, + size_t rows_before_limit, + bool applied_limit, + const Stopwatch & watch, + const Progress & progress, + bool write_statistics, + WriteBuffer & out) + { + writeFieldDelimiter(out, 2); + writeTitle("rows", out, 1, " "); + writeIntText(rows, out); + + if (applied_limit) + { + writeFieldDelimiter(out, 2); + writeTitle("rows_before_limit_at_least", out, 1, " "); + writeIntText(rows_before_limit, out); + } + + if (write_statistics) + { + writeFieldDelimiter(out, 2); + writeObjectStart(out, 1, "statistics"); + + writeTitle("elapsed", out, 2, " "); + writeText(watch.elapsedSeconds(), out); + writeFieldDelimiter(out); + + writeTitle("rows_read", out, 2, " "); + writeText(progress.read_rows.load(), out); + writeFieldDelimiter(out); + + writeTitle("bytes_read", out, 2, " "); + writeText(progress.read_bytes.load(), out); + + writeObjectEnd(out, 1); + } + } + + Strings makeNamesValidJSONStrings(const Strings & names, const FormatSettings & settings, bool validate_utf8) + { + Strings result; + result.reserve(names.size()); + for (const auto & name : names) + { + WriteBufferFromOwnString buf; + if (validate_utf8) + { + WriteBufferValidUTF8 validating_buf(buf); + writeJSONString(name, validating_buf, settings); + } + else + writeJSONString(name, buf, settings); + + result.push_back(buf.str().substr(1, buf.str().size() - 2)); + } + return result; + } + + void skipColon(ReadBuffer & in) + { + skipWhitespaceIfAny(in); + assertChar(':', in); + skipWhitespaceIfAny(in); + } + + String readFieldName(ReadBuffer & in) + { + skipWhitespaceIfAny(in); + String field; + readJSONString(field, in); + skipColon(in); + return field; + } + + 
String readStringField(ReadBuffer & in) + { + skipWhitespaceIfAny(in); + String value; + readJSONString(value, in); + skipWhitespaceIfAny(in); + return value; + } + + void skipArrayStart(ReadBuffer & in) + { + skipWhitespaceIfAny(in); + assertChar('[', in); + skipWhitespaceIfAny(in); + } + + bool checkAndSkipArrayStart(ReadBuffer & in) + { + skipWhitespaceIfAny(in); + if (!checkChar('[', in)) + return false; + skipWhitespaceIfAny(in); + return true; + } + + void skipArrayEnd(ReadBuffer & in) + { + skipWhitespaceIfAny(in); + assertChar(']', in); + skipWhitespaceIfAny(in); + } + + bool checkAndSkipArrayEnd(ReadBuffer & in) + { + skipWhitespaceIfAny(in); + if (!checkChar(']', in)) + return false; + skipWhitespaceIfAny(in); + return true; + } + + void skipObjectStart(ReadBuffer & in) + { + skipWhitespaceIfAny(in); + assertChar('{', in); + skipWhitespaceIfAny(in); + } + + void skipObjectEnd(ReadBuffer & in) + { + skipWhitespaceIfAny(in); + assertChar('}', in); + skipWhitespaceIfAny(in); + } + + bool checkAndSkipObjectEnd(ReadBuffer & in) + { + skipWhitespaceIfAny(in); + if (!checkChar('}', in)) + return false; + skipWhitespaceIfAny(in); + return true; + } + + void skipComma(ReadBuffer & in) + { + skipWhitespaceIfAny(in); + assertChar(',', in); + skipWhitespaceIfAny(in); + } + + std::pair<String, String> readStringFieldNameAndValue(ReadBuffer & in) + { + auto field_name = readFieldName(in); + auto field_value = readStringField(in); + return {field_name, field_value}; + } + + NameAndTypePair readObjectWithNameAndType(ReadBuffer & in) + { + skipObjectStart(in); + auto [first_field_name, first_field_value] = readStringFieldNameAndValue(in); + skipComma(in); + auto [second_field_name, second_field_value] = readStringFieldNameAndValue(in); + + NameAndTypePair name_and_type; + if (first_field_name == "name" && second_field_name == "type") + name_and_type = {first_field_value, DataTypeFactory::instance().get(second_field_value)}; + else if (second_field_name == "name" && 
first_field_name == "type") + name_and_type = {second_field_value, DataTypeFactory::instance().get(first_field_value)}; + else + throw Exception( + ErrorCodes::INCORRECT_DATA, + R"(Expected two fields "name" and "type" with column name and type, found fields "{}" and "{}")", + first_field_name, + second_field_name); + skipObjectEnd(in); + return name_and_type; + } + + NamesAndTypesList readMetadata(ReadBuffer & in) + { + auto field_name = readFieldName(in); + if (field_name != "meta") + throw Exception(ErrorCodes::INCORRECT_DATA, "Expected field \"meta\" with columns names and types, found field {}", field_name); + skipArrayStart(in); + NamesAndTypesList names_and_types; + bool first = true; + while (!checkAndSkipArrayEnd(in)) + { + if (!first) + skipComma(in); + else + first = false; + + names_and_types.push_back(readObjectWithNameAndType(in)); + } + return names_and_types; + } + + NamesAndTypesList readMetadataAndValidateHeader(ReadBuffer & in, const Block & header) + { + auto names_and_types = JSONUtils::readMetadata(in); + for (const auto & [name, type] : names_and_types) + { + if (!header.has(name)) + continue; + + auto header_type = header.getByName(name).type; + if (!type->equals(*header_type)) + throw Exception( + ErrorCodes::INCORRECT_DATA, + "Type {} of column '{}' from metadata is not the same as type in header {}", + type->getName(), name, header_type->getName()); + } + return names_and_types; + } + + bool skipUntilFieldInObject(ReadBuffer & in, const String & desired_field_name) + { + while (!checkAndSkipObjectEnd(in)) + { + auto field_name = JSONUtils::readFieldName(in); + if (field_name == desired_field_name) + return true; + } + + return false; + } + + void skipTheRestOfObject(ReadBuffer & in) + { + while (!checkAndSkipObjectEnd(in)) + { + skipComma(in); + auto name = readFieldName(in); + skipWhitespaceIfAny(in); + skipJSONField(in, name); + } + } + +} + +} diff --git a/contrib/clickhouse/src/Formats/JSONUtils.h 
b/contrib/clickhouse/src/Formats/JSONUtils.h new file mode 100644 index 0000000000..bd56eb646c --- /dev/null +++ b/contrib/clickhouse/src/Formats/JSONUtils.h @@ -0,0 +1,132 @@ +#pragma once + +#include <DataTypes/IDataType.h> +#include <DataTypes/Serializations/ISerialization.h> +#include <Formats/FormatSettings.h> +#include <IO/BufferWithOwnMemory.h> +#include <IO/ReadBuffer.h> +#include <IO/Progress.h> +#include <Core/NamesAndTypes.h> +#include <Common/Stopwatch.h> +#include <utility> + +namespace DB +{ + +struct JSONInferenceInfo; + +namespace JSONUtils +{ + std::pair<bool, size_t> fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t max_rows); + std::pair<bool, size_t> fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t min_rows, size_t max_rows); + + void skipRowForJSONEachRow(ReadBuffer & in); + void skipRowForJSONCompactEachRow(ReadBuffer & in); + + /// Read row in JSONEachRow format and try to determine type for each field. + /// Return list of names and types. + /// If cannot determine the type of some field, return nullptr for it. + NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, const FormatSettings & settings, JSONInferenceInfo * inference_info); + + /// Read row in JSONCompactEachRow format and try to determine type for each field. + /// If cannot determine the type of some field, return nullptr for it. 
+ DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, const FormatSettings & settings, JSONInferenceInfo * inference_info); + + bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf); + + bool readField( + ReadBuffer & in, + IColumn & column, + const DataTypePtr & type, + const SerializationPtr & serialization, + const String & column_name, + const FormatSettings & format_settings, + bool yield_strings); + + Strings makeNamesValidJSONStrings(const Strings & names, const FormatSettings & settings, bool validate_utf8); + + /// Functions helpers for writing JSON data to WriteBuffer. + + void writeFieldDelimiter(WriteBuffer & out, size_t new_lines = 1); + + void writeFieldCompactDelimiter(WriteBuffer & out); + + void writeObjectStart(WriteBuffer & out, size_t indent = 0, const char * title = nullptr); + + void writeCompactObjectStart(WriteBuffer & out, size_t indent = 0, const char * title = nullptr); + + void writeObjectEnd(WriteBuffer & out, size_t indent = 0); + + void writeCompactObjectEnd(WriteBuffer & out); + + void writeArrayStart(WriteBuffer & out, size_t indent = 0, const char * title = nullptr); + + void writeCompactArrayStart(WriteBuffer & out, size_t indent = 0, const char * title = nullptr); + + void writeArrayEnd(WriteBuffer & out, size_t indent = 0); + + void writeCompactArrayEnd(WriteBuffer & out); + + void writeFieldFromColumn( + const IColumn & column, + const ISerialization & serialization, + size_t row_num, + bool yield_strings, + const FormatSettings & settings, + WriteBuffer & out, + const std::optional<String> & name = std::nullopt, + size_t indent = 0, + const char * title_after_delimiter = " ", + bool pretty_json = false); + + void writeColumns( + const Columns & columns, + const Names & names, + const Serializations & serializations, + size_t row_num, + bool yield_strings, + const FormatSettings & settings, + WriteBuffer & out, + size_t indent = 0); + + void writeCompactColumns( + const Columns & columns, + 
const Serializations & serializations, + size_t row_num, + bool yield_strings, + const FormatSettings & settings, + WriteBuffer & out); + + void writeMetadata(const Names & names, const DataTypes & types, const FormatSettings & settings, WriteBuffer & out); + + void writeAdditionalInfo( + size_t rows, + size_t rows_before_limit, + bool applied_limit, + const Stopwatch & watch, + const Progress & progress, + bool write_statistics, + WriteBuffer & out); + + void skipColon(ReadBuffer & in); + void skipComma(ReadBuffer & in); + + String readFieldName(ReadBuffer & in); + + void skipArrayStart(ReadBuffer & in); + void skipArrayEnd(ReadBuffer & in); + bool checkAndSkipArrayStart(ReadBuffer & in); + bool checkAndSkipArrayEnd(ReadBuffer & in); + + void skipObjectStart(ReadBuffer & in); + void skipObjectEnd(ReadBuffer & in); + bool checkAndSkipObjectEnd(ReadBuffer & in); + + NamesAndTypesList readMetadata(ReadBuffer & in); + NamesAndTypesList readMetadataAndValidateHeader(ReadBuffer & in, const Block & header); + + bool skipUntilFieldInObject(ReadBuffer & in, const String & desired_field_name); + void skipTheRestOfObject(ReadBuffer & in); +} + +} diff --git a/contrib/clickhouse/src/Formats/MarkInCompressedFile.cpp b/contrib/clickhouse/src/Formats/MarkInCompressedFile.cpp new file mode 100644 index 0000000000..41f6152dc1 --- /dev/null +++ b/contrib/clickhouse/src/Formats/MarkInCompressedFile.cpp @@ -0,0 +1,103 @@ +#include <Formats/MarkInCompressedFile.h> + +#include <Common/BitHelpers.h> + +namespace DB +{ + +// Write a range of bits in a bit-packed array. +// The array must be overallocated by one element. +// The bit range must be pre-filled with zeros. +void writeBits(UInt64 * dest, size_t bit_offset, UInt64 value) +{ + size_t mod = bit_offset % 64; + dest[bit_offset / 64] |= value << mod; + if (mod) + dest[bit_offset / 64 + 1] |= value >> (64 - mod); +} + +// The array must be overallocated by one element. 
+UInt64 readBits(const UInt64 * src, size_t bit_offset, size_t num_bits) +{ + size_t mod = bit_offset % 64; + UInt64 value = src[bit_offset / 64] >> mod; + if (mod) + value |= src[bit_offset / 64 + 1] << (64 - mod); + return value & maskLowBits<UInt64>(num_bits); +} + +MarksInCompressedFile::MarksInCompressedFile(const PlainArray & marks) + : num_marks(marks.size()), blocks((marks.size() + MARKS_PER_BLOCK - 1) / MARKS_PER_BLOCK, BlockInfo{}) +{ + if (num_marks == 0) + { + return; + } + + // First pass: calculate layout of all blocks and total memory required. + size_t packed_bits = 0; + for (size_t block_idx = 0; block_idx < blocks.size(); ++block_idx) + { + BlockInfo & block = blocks[block_idx]; + block.bit_offset_in_packed_array = packed_bits; + + size_t max_x = 0; + size_t max_y = 0; + size_t num_marks_in_this_block = std::min(MARKS_PER_BLOCK, num_marks - block_idx * MARKS_PER_BLOCK); + for (size_t i = 0; i < num_marks_in_this_block; ++i) + { + const auto & mark = marks[block_idx * MARKS_PER_BLOCK + i]; + block.min_x = std::min(block.min_x, mark.offset_in_compressed_file); + max_x = std::max(max_x, mark.offset_in_compressed_file); + block.min_y = std::min(block.min_y, mark.offset_in_decompressed_block); + max_y = std::max(max_y, mark.offset_in_decompressed_block); + + block.trailing_zero_bits_in_y + = std::min(block.trailing_zero_bits_in_y, static_cast<UInt8>(getTrailingZeroBits(mark.offset_in_decompressed_block))); + } + + block.bits_for_x = sizeof(size_t) * 8 - getLeadingZeroBits(max_x - block.min_x); + block.bits_for_y = sizeof(size_t) * 8 - getLeadingZeroBits((max_y - block.min_y) >> block.trailing_zero_bits_in_y); + packed_bits += num_marks_in_this_block * (block.bits_for_x + block.bits_for_y); + } + + // Overallocate by +1 element to let the bit packing/unpacking do less bounds checking. 
+ size_t packed_length = (packed_bits + 63) / 64 + 1; + packed.reserve_exact(packed_length); + packed.resize_fill(packed_length); + + // Second pass: write out the packed marks. + for (size_t idx = 0; idx < num_marks; ++idx) + { + const auto & mark = marks[idx]; + auto [block, offset] = lookUpMark(idx); + writeBits(packed.data(), offset, mark.offset_in_compressed_file - block->min_x); + writeBits( + packed.data(), + offset + block->bits_for_x, + (mark.offset_in_decompressed_block - block->min_y) >> block->trailing_zero_bits_in_y); + } +} + +MarkInCompressedFile MarksInCompressedFile::get(size_t idx) const +{ + auto [block, offset] = lookUpMark(idx); + size_t x = block->min_x + readBits(packed.data(), offset, block->bits_for_x); + size_t y = block->min_y + (readBits(packed.data(), offset + block->bits_for_x, block->bits_for_y) << block->trailing_zero_bits_in_y); + return MarkInCompressedFile{.offset_in_compressed_file = x, .offset_in_decompressed_block = y}; +} + +std::tuple<const MarksInCompressedFile::BlockInfo *, size_t> MarksInCompressedFile::lookUpMark(size_t idx) const +{ + size_t block_idx = idx / MARKS_PER_BLOCK; + const BlockInfo & block = blocks[block_idx]; + size_t offset = block.bit_offset_in_packed_array + (idx - block_idx * MARKS_PER_BLOCK) * (block.bits_for_x + block.bits_for_y); + return {&block, offset}; +} + +size_t MarksInCompressedFile::approximateMemoryUsage() const +{ + return sizeof(*this) + blocks.size() * sizeof(blocks[0]) + packed.size() * sizeof(packed[0]); +} + +} diff --git a/contrib/clickhouse/src/Formats/MarkInCompressedFile.h b/contrib/clickhouse/src/Formats/MarkInCompressedFile.h new file mode 100644 index 0000000000..08e4f182c4 --- /dev/null +++ b/contrib/clickhouse/src/Formats/MarkInCompressedFile.h @@ -0,0 +1,120 @@ +#pragma once + +#include <tuple> + +#include <IO/WriteHelpers.h> +#include <base/types.h> +#include <Common/PODArray.h> + + +namespace DB +{ + +/** Mark is the position in the compressed file. 
The compressed file consists of adjacent compressed blocks. + * Mark is a tuple - the offset in the file to the start of the compressed block, the offset in the decompressed block to the start of the data. + */ +struct MarkInCompressedFile +{ + size_t offset_in_compressed_file; + size_t offset_in_decompressed_block; + + bool operator==(const MarkInCompressedFile & rhs) const + { + return std::tie(offset_in_compressed_file, offset_in_decompressed_block) + == std::tie(rhs.offset_in_compressed_file, rhs.offset_in_decompressed_block); + } + bool operator!=(const MarkInCompressedFile & rhs) const { return !(*this == rhs); } + + auto asTuple() const { return std::make_tuple(offset_in_compressed_file, offset_in_decompressed_block); } + + String toString() const + { + return "(" + DB::toString(offset_in_compressed_file) + "," + DB::toString(offset_in_decompressed_block) + ")"; + } + + String toStringWithRows(size_t rows_num) const + { + return "(" + DB::toString(offset_in_compressed_file) + "," + DB::toString(offset_in_decompressed_block) + "," + + DB::toString(rows_num) + ")"; + } +}; + +/** + * In-memory representation of an array of marks. + * + * Uses an ad-hoc compression scheme that decreases memory usage while allowing + * random access in O(1) time. + * This is independent from the marks *file* format, which may be uncompressed + * or use a different compression method. + * + * Typical memory usage: + * * ~3 bytes/mark for integer columns + * * ~5 bytes/mark for string columns + * * ~0.3 bytes/mark for trivial marks in auxiliary dict files of LowCardinality columns + */ +class MarksInCompressedFile +{ +public: + using PlainArray = PODArray<MarkInCompressedFile>; + + MarksInCompressedFile(const PlainArray & marks); + + MarkInCompressedFile get(size_t idx) const; + + size_t approximateMemoryUsage() const; + +private: + /** Throughout this class: + * * "x" stands for offset_in_compressed_file, + * * "y" stands for offset_in_decompressed_block. 
+ */ + + /** We need to store a sequence of marks, each consisting of two 64-bit integers: + * offset_in_compressed_file and offset_in_decompressed_block. We'll call them x and y for + * convenience, since compression doesn't care what they mean. The compression exploits the + * following regularities: + * * y is usually zero. + * * x usually increases steadily. + * * Differences between x values in nearby marks usually fit in much fewer than 64 bits. + * + * We split the sequence of marks into blocks, each containing MARKS_PER_BLOCK marks. + * (Not to be confused with data blocks.) + * For each mark, we store the difference [value] - [min value in the block], for each of the + * two values in the mark. Each block specifies the number of bits to use for these differences + * for all marks in this block. + * The smaller the blocks the fewer bits are required, but the bigger the relative overhead of + * block headers. + * + * Packed marks and block headers all live in one contiguous array. + */ + + struct BlockInfo + { + // Min offset_in_compressed_file and offset_in_decompressed_block, correspondingly. + size_t min_x = UINT64_MAX; + size_t min_y = UINT64_MAX; + + // Place in `packed` where this block start. + size_t bit_offset_in_packed_array; + + // How many bits each mark takes. These numbers are bit-packed in the `packed` array. + // Can be zero. (Especially for y, which is typically all zeroes.) + UInt8 bits_for_x; + UInt8 bits_for_y; + // The `y` values should be <<'ed by this amount. + // Useful for integer columns when marks granularity is a power of 2; in this case all + // offset_in_decompressed_block values are divisible by 2^15 or so. + UInt8 trailing_zero_bits_in_y = 63; + }; + + static constexpr size_t MARKS_PER_BLOCK = 256; + + size_t num_marks; + PODArray<BlockInfo> blocks; + PODArray<UInt64> packed; + + // Mark idx -> {block info, bit offset in `packed`}. 
+ std::tuple<const BlockInfo *, size_t> lookUpMark(size_t idx) const; +}; + +} diff --git a/contrib/clickhouse/src/Formats/MsgPackExtensionTypes.h b/contrib/clickhouse/src/Formats/MsgPackExtensionTypes.h new file mode 100644 index 0000000000..2f7d28eb5b --- /dev/null +++ b/contrib/clickhouse/src/Formats/MsgPackExtensionTypes.h @@ -0,0 +1,11 @@ +#pragma once + +namespace DB +{ + +enum class MsgPackExtensionTypes +{ + UUIDType = 0x02, +}; + +} diff --git a/contrib/clickhouse/src/Formats/NativeReader.cpp b/contrib/clickhouse/src/Formats/NativeReader.cpp new file mode 100644 index 0000000000..4c25460eb6 --- /dev/null +++ b/contrib/clickhouse/src/Formats/NativeReader.cpp @@ -0,0 +1,317 @@ +#include <Core/Defines.h> +#include <Core/ProtocolDefines.h> + +#include <IO/ReadHelpers.h> +#include <IO/VarInt.h> +#include <Compression/CompressedReadBufferFromFile.h> + +#include <DataTypes/DataTypeFactory.h> +#include <Common/typeid_cast.h> +#include <base/range.h> + +#include <Formats/NativeReader.h> +#include <Formats/insertNullAsDefaultIfNeeded.h> +#include <DataTypes/DataTypeLowCardinality.h> +#include <DataTypes/Serializations/SerializationInfo.h> +#include <DataTypes/DataTypeAggregateFunction.h> + +#include <Interpreters/castColumn.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INCORRECT_INDEX; + extern const int LOGICAL_ERROR; + extern const int CANNOT_READ_ALL_DATA; + extern const int INCORRECT_DATA; + extern const int TOO_LARGE_ARRAY_SIZE; +} + + +NativeReader::NativeReader(ReadBuffer & istr_, UInt64 server_revision_) + : istr(istr_), server_revision(server_revision_) +{ +} + +NativeReader::NativeReader( + ReadBuffer & istr_, + const Block & header_, + UInt64 server_revision_, + bool skip_unknown_columns_, + bool null_as_default_, + bool allow_types_conversion_, + BlockMissingValues * block_missing_values_) + : istr(istr_) + , header(header_) + , server_revision(server_revision_) + , skip_unknown_columns(skip_unknown_columns_) + , 
null_as_default(null_as_default_) + , allow_types_conversion(allow_types_conversion_) + , block_missing_values(block_missing_values_) +{ +} + +NativeReader::NativeReader(ReadBuffer & istr_, UInt64 server_revision_, + IndexForNativeFormat::Blocks::const_iterator index_block_it_, + IndexForNativeFormat::Blocks::const_iterator index_block_end_) + : istr(istr_), server_revision(server_revision_), + use_index(true), index_block_it(index_block_it_), index_block_end(index_block_end_) +{ + istr_concrete = typeid_cast<CompressedReadBufferFromFile *>(&istr); + if (!istr_concrete) + throw Exception(ErrorCodes::LOGICAL_ERROR, "When need to use index for NativeReader, istr must be CompressedReadBufferFromFile."); + + if (index_block_it == index_block_end) + return; + + index_column_it = index_block_it->columns.begin(); + + /// Initialize header from the index. + for (const auto & column : index_block_it->columns) + { + auto type = DataTypeFactory::instance().get(column.type); + header.insert(ColumnWithTypeAndName{ type, column.name }); + } +} + +void NativeReader::resetParser() +{ + istr_concrete = nullptr; + use_index = false; +} + +void NativeReader::readData(const ISerialization & serialization, ColumnPtr & column, ReadBuffer & istr, size_t rows, double avg_value_size_hint) +{ + ISerialization::DeserializeBinaryBulkSettings settings; + settings.getter = [&](ISerialization::SubstreamPath) -> ReadBuffer * { return &istr; }; + settings.avg_value_size_hint = avg_value_size_hint; + settings.position_independent_encoding = false; + settings.native_format = true; + + ISerialization::DeserializeBinaryBulkStatePtr state; + + serialization.deserializeBinaryBulkStatePrefix(settings, state); + serialization.deserializeBinaryBulkWithMultipleStreams(column, rows, settings, state, nullptr); + + if (column->size() != rows) + throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, + "Cannot read all data in NativeReader. Rows read: {}. 
Rows expected: {}", column->size(), rows); +} + + +Block NativeReader::getHeader() const +{ + return header; +} + + +Block NativeReader::read() +{ + Block res; + + const DataTypeFactory & data_type_factory = DataTypeFactory::instance(); + + if (use_index && index_block_it == index_block_end) + return res; + + if (istr.eof()) + { + if (use_index) + throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Input doesn't contain all data for index."); + + return res; + } + + /// Additional information about the block. + if (server_revision > 0) + res.info.read(istr); + + /// Dimensions + size_t columns = 0; + size_t rows = 0; + + if (!use_index) + { + readVarUInt(columns, istr); + readVarUInt(rows, istr); + + if (columns > 1'000'000uz) + throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Suspiciously many columns in Native format: {}", columns); + if (rows > 1'000'000'000'000uz) + throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Suspiciously many rows in Native format: {}", rows); + } + else + { + columns = index_block_it->num_columns; + rows = index_block_it->num_rows; + } + + if (columns == 0 && !header && rows != 0) + throw Exception(ErrorCodes::INCORRECT_DATA, "Zero columns but {} rows in Native format.", rows); + + for (size_t i = 0; i < columns; ++i) + { + if (use_index) + { + /// If the current position is what is required, the real seek does not occur. 
+ istr_concrete->seek(index_column_it->location.offset_in_compressed_file, index_column_it->location.offset_in_decompressed_block); + } + + ColumnWithTypeAndName column; + + /// Name + readBinary(column.name, istr); + + /// Type + String type_name; + readBinary(type_name, istr); + column.type = data_type_factory.get(type_name); + + setVersionToAggregateFunctions(column.type, true, server_revision); + + SerializationPtr serialization; + if (server_revision >= DBMS_MIN_REVISION_WITH_CUSTOM_SERIALIZATION) + { + auto info = column.type->createSerializationInfo({}); + + UInt8 has_custom; + readBinary(has_custom, istr); + if (has_custom) + info->deserializeFromKindsBinary(istr); + + serialization = column.type->getSerialization(*info); + } + else + { + serialization = column.type->getDefaultSerialization(); + } + + if (use_index) + { + /// Index allows to do more checks. + if (index_column_it->name != column.name) + throw Exception(ErrorCodes::INCORRECT_INDEX, "Index points to column with wrong name: corrupted index or data"); + if (index_column_it->type != type_name) + throw Exception(ErrorCodes::INCORRECT_INDEX, "Index points to column with wrong type: corrupted index or data"); + } + + /// Data + ColumnPtr read_column = column.type->createColumn(*serialization); + + double avg_value_size_hint = avg_value_size_hints.empty() ? 0 : avg_value_size_hints[i]; + if (rows) /// If no rows, nothing to read. 
+ readData(*serialization, read_column, istr, rows, avg_value_size_hint); + + column.column = std::move(read_column); + + bool use_in_result = true; + if (header) + { + if (header.has(column.name)) + { + auto & header_column = header.getByName(column.name); + + if (null_as_default) + insertNullAsDefaultIfNeeded(column, header_column, header.getPositionByName(column.name), block_missing_values); + + if (!header_column.type->equals(*column.type)) + { + if (allow_types_conversion) + { + try + { + column.column = castColumn(column, header_column.type); + } + catch (Exception & e) + { + e.addMessage(fmt::format( + "while converting column \"{}\" from type {} to type {}", + column.name, + column.type->getName(), + header_column.type->getName())); + throw; + } + } + else + { + /// Support insert from old clients without low cardinality type. + column.column = recursiveLowCardinalityTypeConversion(column.column, column.type, header_column.type); + } + + column.type = header_column.type; + } + } + else + { + if (!skip_unknown_columns) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown column with name {} found while reading data in Native format", column.name); + use_in_result = false; + } + } + + if (use_in_result) + res.insert(std::move(column)); + + if (use_index) + ++index_column_it; + } + + if (use_index) + { + if (index_column_it != index_block_it->columns.end()) + throw Exception(ErrorCodes::INCORRECT_INDEX, "Inconsistent index: not all columns were read"); + + ++index_block_it; + if (index_block_it != index_block_end) + index_column_it = index_block_it->columns.begin(); + } + + if (rows && header) + { + /// Allow to skip columns. Fill them with default values. 
+ Block tmp_res; + + for (size_t column_i = 0; column_i != header.columns(); ++column_i) + { + auto & col = header.getByPosition(column_i); + if (res.has(col.name)) + { + tmp_res.insert(res.getByName(col.name)); + } + else + { + tmp_res.insert({col.type->createColumn()->cloneResized(rows), col.type, col.name}); + if (block_missing_values) + block_missing_values->setBits(column_i, rows); + } + } + tmp_res.info = res.info; + + res.swap(tmp_res); + } + + if (res.rows() != rows) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Row count mismatch after desirialization, got: {}, expected: {}", res.rows(), rows); + + return res; +} + +void NativeReader::updateAvgValueSizeHints(const Block & block) +{ + auto rows = block.rows(); + if (rows < 10) + return; + + avg_value_size_hints.resize_fill(block.columns(), 0); + + for (auto idx : collections::range(0, block.columns())) + { + auto & avg_value_size_hint = avg_value_size_hints[idx]; + IDataType::updateAvgValueSizeHint(*block.getByPosition(idx).column, avg_value_size_hint); + } +} + +} diff --git a/contrib/clickhouse/src/Formats/NativeReader.h b/contrib/clickhouse/src/Formats/NativeReader.h new file mode 100644 index 0000000000..3cec4afd99 --- /dev/null +++ b/contrib/clickhouse/src/Formats/NativeReader.h @@ -0,0 +1,71 @@ +#pragma once + +#include <Formats/IndexForNativeFormat.h> +#include <Formats/MarkInCompressedFile.h> +#include <Common/PODArray.h> +#include <Core/Block.h> + +namespace DB +{ + +class CompressedReadBufferFromFile; + +/** Deserializes the stream of blocks from the native binary format (with names and column types). + * Designed for communication between servers. + * + * Can also be used to store data on disk. + * In this case, can use the index. + */ +class NativeReader +{ +public: + /// If a non-zero server_revision is specified, additional block information may be expected and read. + NativeReader(ReadBuffer & istr_, UInt64 server_revision_); + + /// For cases when data structure (header) is known in advance. 
+ /// NOTE We may use header for data validation and/or type conversions. It is not implemented. + NativeReader( + ReadBuffer & istr_, + const Block & header_, + UInt64 server_revision_, + bool skip_unknown_columns_ = false, + bool null_as_default_ = false, + bool allow_types_conversion_ = false, + BlockMissingValues * block_missing_values_ = nullptr); + + /// For cases when we have an index. It allows to skip columns. Only columns specified in the index will be read. + NativeReader(ReadBuffer & istr_, UInt64 server_revision_, + IndexForNativeFormat::Blocks::const_iterator index_block_it_, + IndexForNativeFormat::Blocks::const_iterator index_block_end_); + + static void readData(const ISerialization & serialization, ColumnPtr & column, ReadBuffer & istr, size_t rows, double avg_value_size_hint); + + Block getHeader() const; + + void resetParser(); + + Block read(); + +private: + ReadBuffer & istr; + Block header; + UInt64 server_revision; + bool skip_unknown_columns = false; + bool null_as_default = false; + bool allow_types_conversion = false; + BlockMissingValues * block_missing_values = nullptr; + + bool use_index = false; + IndexForNativeFormat::Blocks::const_iterator index_block_it; + IndexForNativeFormat::Blocks::const_iterator index_block_end; + IndexOfBlockForNativeFormat::Columns::const_iterator index_column_it; + + /// If an index is specified, then `istr` must be CompressedReadBufferFromFile. Unused otherwise. 
+ CompressedReadBufferFromFile * istr_concrete = nullptr; + + PODArray<double> avg_value_size_hints; + + void updateAvgValueSizeHints(const Block & block); +}; + +} diff --git a/contrib/clickhouse/src/Formats/NativeWriter.cpp b/contrib/clickhouse/src/Formats/NativeWriter.cpp new file mode 100644 index 0000000000..70d5b7914a --- /dev/null +++ b/contrib/clickhouse/src/Formats/NativeWriter.cpp @@ -0,0 +1,182 @@ +#include <Core/ProtocolDefines.h> +#include <Core/Block.h> + +#include <IO/WriteHelpers.h> +#include <IO/VarInt.h> +#include <Compression/CompressedWriteBuffer.h> +#include <DataTypes/Serializations/SerializationInfo.h> + +#include <Formats/IndexForNativeFormat.h> +#include <Formats/MarkInCompressedFile.h> +#include <Formats/NativeWriter.h> + +#include <Common/typeid_cast.h> +#include <Columns/ColumnSparse.h> +#include <DataTypes/DataTypeLowCardinality.h> +#include <DataTypes/DataTypeAggregateFunction.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + + +NativeWriter::NativeWriter( + WriteBuffer & ostr_, UInt64 client_revision_, const Block & header_, bool remove_low_cardinality_, + IndexForNativeFormat * index_, size_t initial_size_of_file_) + : ostr(ostr_), client_revision(client_revision_), header(header_), + index(index_), initial_size_of_file(initial_size_of_file_), remove_low_cardinality(remove_low_cardinality_) +{ + if (index) + { + ostr_concrete = typeid_cast<CompressedWriteBuffer *>(&ostr); + if (!ostr_concrete) + throw Exception(ErrorCodes::LOGICAL_ERROR, "When need to write index for NativeWriter, ostr must be CompressedWriteBuffer."); + } +} + + +void NativeWriter::flush() +{ + ostr.next(); +} + + +static void writeData(const ISerialization & serialization, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) +{ + /** If there are columns-constants - then we materialize them. + * (Since the data type does not know how to serialize / deserialize constants.) 
/// Writes one block to `ostr` and returns the number of bytes that `ostr.count()` advanced by.
///
/// Order of what is written:
///  - block info (only when client_revision > 0);
///  - number of columns and number of rows as VarUInt;
///  - for every column: name, type name, optional "has custom serialization" byte (plus the
///    serialization kinds when that byte is set), and finally the column data (skipped for zero rows).
///
/// When an index was passed to the constructor, a mark for every column is collected and the
/// resulting index block is appended to `index->blocks`.
size_t NativeWriter::write(const Block & block)
{
    /// Remember the starting counter so that the return value covers only this call.
    size_t written_before = ostr.count();

    /// Additional information about the block.
    if (client_revision > 0)
        block.info.write(ostr);

    block.checkNumberOfRows();

    /// Dimensions
    size_t columns = block.columns();
    size_t rows = block.rows();

    writeVarUInt(columns, ostr);
    writeVarUInt(rows, ostr);

    /** The index has the same structure as the data stream.
      * But instead of column values, it contains a mark that points to the location in the data file where this part of the column is located.
      */
    IndexOfBlockForNativeFormat index_block;
    if (index)
    {
        index_block.num_columns = columns;
        index_block.num_rows = rows;
        index_block.columns.resize(columns);
    }

    for (size_t i = 0; i < columns; ++i)
    {
        /// For the index.
        MarkInCompressedFile mark{0, 0};

        if (index)
        {
            /// The mark is taken before anything of this column (including its name) is written.
            ostr_concrete->next(); /// Finish compressed block.
            mark.offset_in_compressed_file = initial_size_of_file + ostr_concrete->getCompressedBytes();
            mark.offset_in_decompressed_block = ostr_concrete->getRemainingBytes();
        }

        /// A copy: the column and its type may be replaced below without touching the caller's block.
        auto column = block.safeGetByPosition(i);

        /// Send data to old clients without low cardinality type.
        if (remove_low_cardinality || (client_revision && client_revision < DBMS_MIN_REVISION_WITH_LOW_CARDINALITY_TYPE))
        {
            column.column = recursiveRemoveLowCardinality(column.column);
            column.type = recursiveRemoveLowCardinality(column.type);
        }

        /// Name
        writeStringBinary(column.name, ostr);

        /// The revision is passed on to aggregate function types only if the client is new enough.
        bool include_version = client_revision >= DBMS_MIN_REVISION_WITH_AGGREGATE_FUNCTIONS_VERSIONING;
        setVersionToAggregateFunctions(column.type, include_version, include_version ? std::optional<size_t>(client_revision) : std::nullopt);

        /// Type
        String type_name = column.type->getName();

        /// For compatibility, we will not send explicit timezone parameter in DateTime data type
        /// to older clients, that cannot understand it.
        /// NOTE(review): unlike the low cardinality check above, this comparison is not guarded by
        /// `client_revision != 0`, so it also applies when client_revision is zero - confirm this is intended.
        if (client_revision < DBMS_MIN_REVISION_WITH_TIME_ZONE_PARAMETER_IN_DATETIME_DATA_TYPE
            && startsWith(type_name, "DateTime("))
            type_name = "DateTime";

        writeStringBinary(type_name, ostr);

        /// Serialization. Dynamic, if client supports it.
        SerializationPtr serialization;
        if (client_revision >= DBMS_MIN_REVISION_WITH_CUSTOM_SERIALIZATION)
        {
            auto info = column.type->getSerializationInfo(*column.column);
            bool has_custom = false;

            if (client_revision >= DBMS_MIN_REVISION_WITH_SPARSE_SERIALIZATION)
            {
                serialization = column.type->getSerialization(*info);
                has_custom = info->hasCustomSerialization();
            }
            else
            {
                /// The flag byte is still written for such a client, but it is always zero.
                serialization = column.type->getDefaultSerialization();
                column.column = recursiveRemoveSparse(column.column);
            }

            writeBinary(static_cast<UInt8>(has_custom), ostr);
            if (has_custom)
                info->serialializeKindBinary(ostr);
        }
        else
        {
            /// No flag byte at all for this client: default serialization, sparse columns are expanded.
            serialization = column.type->getDefaultSerialization();
            column.column = recursiveRemoveSparse(column.column);
        }

        /// Data
        if (rows)    /// Zero items of data is always represented as zero number of bytes.
            writeData(*serialization, column.column, ostr, 0, 0);

        if (index)
        {
            /// The type name stored in the index is taken from the (possibly converted) type,
            /// not from `type_name` that was sent to the client.
            index_block.columns[i].name = column.name;
            index_block.columns[i].type = column.type->getName();
            index_block.columns[i].location.offset_in_compressed_file = mark.offset_in_compressed_file;
            index_block.columns[i].location.offset_in_decompressed_block = mark.offset_in_decompressed_block;
        }
    }

    if (index)
        index->blocks.emplace_back(std::move(index_block));

    size_t written_after = ostr.count();
    size_t written_size = written_after - written_before;
    return written_size;
}
+ size_t write(const Block & block); + void flush(); + + static String getContentType() { return "application/octet-stream"; } + +private: + WriteBuffer & ostr; + UInt64 client_revision; + Block header; + IndexForNativeFormat * index = nullptr; + size_t initial_size_of_file; /// The initial size of the data file, if `append` done. Used for the index. + /// If you need to write index, then `ostr` must be a CompressedWriteBuffer. + CompressedWriteBuffer * ostr_concrete = nullptr; + + bool remove_low_cardinality; +}; + +} diff --git a/contrib/clickhouse/src/Formats/ParsedTemplateFormatString.cpp b/contrib/clickhouse/src/Formats/ParsedTemplateFormatString.cpp new file mode 100644 index 0000000000..df76d9f2cb --- /dev/null +++ b/contrib/clickhouse/src/Formats/ParsedTemplateFormatString.cpp @@ -0,0 +1,193 @@ +#include <Formats/ParsedTemplateFormatString.h> +#include <Formats/verbosePrintString.h> +#include <Formats/EscapingRuleUtils.h> +#include <IO/ReadBufferFromMemory.h> +#include <IO/Operators.h> +#include <IO/ReadBufferFromFile.h> +#include <Interpreters/Context.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INVALID_TEMPLATE_FORMAT; +} + +ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name, bool allow_indexes) +{ + ReadBufferFromFile schema_file(schema.absoluteSchemaPath(), 4096); + String format_string; + readStringUntilEOF(format_string, schema_file); + try + { + parse(format_string, idx_by_name, allow_indexes); + } + catch (DB::Exception & e) + { + if (e.code() != ErrorCodes::INVALID_TEMPLATE_FORMAT) + throwInvalidFormat(e.message(), columnsCount()); + else + throw; + } +} + + +void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name, bool allow_indexes) +{ + enum ParserState + { + Delimiter, + Column, + Format + }; + + const char * pos = format_string.c_str(); + const char * end = format_string.c_str() + 
format_string.size(); + const char * token_begin = pos; + ParserState state = Delimiter; + delimiters.emplace_back(); + char * col_idx_end; + std::optional<size_t> column_idx; + for (; *pos; ++pos) + { + switch (state) + { + case Delimiter: + if (*pos == '$') + { + delimiters.back().append(token_begin, pos - token_begin); + ++pos; + if (*pos == '{') + { + token_begin = pos + 1; + state = Column; + } + else if (*pos == '$') + { + token_begin = pos; + } + else + throwInvalidFormat("At pos " + std::to_string(pos - format_string.c_str()) + + ": Expected '{' or '$' after '$'" + + ", got \"" + std::string(pos, std::min(end - pos, 16l)) + "\"", columnsCount()); + } + break; + + case Column: + column_names.emplace_back(); + pos = readMayBeQuotedColumnNameInto(pos, end - pos, column_names.back()); + + if (*pos == ':') + state = Format; + else if (*pos == '}') + { + escaping_rules.push_back(EscapingRule::None); + delimiters.emplace_back(); + state = Delimiter; + } + else + throwInvalidFormat("At pos " + std::to_string(pos - format_string.c_str()) + + ": Expected ':' or '}' after column name \"" + column_names.back() + "\"" + + ", got \"" + std::string(pos, std::min(end - pos, 16l)) + "\"", columnsCount()); + + token_begin = pos + 1; + column_idx.reset(); + if (!column_names.back().empty()) + { + col_idx_end = nullptr; + errno = 0; + column_idx = strtoull(column_names.back().c_str(), &col_idx_end, 10); + if (col_idx_end != column_names.back().c_str() + column_names.back().size() || errno) + column_idx = idx_by_name(column_names.back()); + else if (!allow_indexes) + throw Exception(ErrorCodes::INVALID_TEMPLATE_FORMAT, "Indexes instead of names are not allowed"); + } + format_idx_to_column_idx.emplace_back(column_idx); + break; + + case Format: + if (*pos == '}') + { + escaping_rules.push_back(stringToEscapingRule(String(token_begin, pos - token_begin))); + token_begin = pos + 1; + delimiters.emplace_back(); + state = Delimiter; + } + } + } + if (state != Delimiter) + 
throwInvalidFormat("Unbalanced parentheses", columnsCount()); + delimiters.back().append(token_begin, pos - token_begin); +} + +size_t ParsedTemplateFormatString::columnsCount() const +{ + return format_idx_to_column_idx.size(); +} + +const char * ParsedTemplateFormatString::readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s) +{ + s.clear(); + if (!size) + return pos; + ReadBufferFromMemory buf{pos, size}; + if (*pos == '"') + readDoubleQuotedStringWithSQLStyle(s, buf); + else if (*pos == '`') + readBackQuotedStringWithSQLStyle(s, buf); + else if (isWordCharASCII(*pos)) + { + size_t name_size = 1; + while (name_size < size && isWordCharASCII(*(pos + name_size))) + ++name_size; + s = String{pos, name_size}; + return pos + name_size; + } + return pos + buf.count(); +} + +String ParsedTemplateFormatString::dump() const +{ + WriteBufferFromOwnString res; + res << "\nDelimiter " << 0 << ": "; + verbosePrintString(delimiters.front().c_str(), delimiters.front().c_str() + delimiters.front().size(), res); + + size_t num_columns = std::max(escaping_rules.size(), format_idx_to_column_idx.size()); + for (size_t i = 0; i < num_columns; ++i) + { + res << "\nColumn " << i << ": \""; + if (column_names.size() <= i) + res << "<ERROR>"; + else if (column_names[i].empty()) + res << "<SKIPPED>"; + else + res << column_names[i]; + + res << "\" (mapped to table column "; + if (format_idx_to_column_idx.size() <= i) + res << "<ERROR>"; + else if (!format_idx_to_column_idx[i]) + res << "<SKIPPED>"; + else + res << *format_idx_to_column_idx[i]; + + res << "), Format " << (i < escaping_rules.size() ? 
escapingRuleToString(escaping_rules[i]) : "<ERROR>"); + + res << "\nDelimiter " << i + 1 << ": "; + if (delimiters.size() <= i + 1) + res << "<ERROR>"; + else + verbosePrintString(delimiters[i + 1].c_str(), delimiters[i + 1].c_str() + delimiters[i + 1].size(), res); + } + + return res.str(); +} + +void ParsedTemplateFormatString::throwInvalidFormat(const String & message, size_t column) const +{ + throw Exception(ErrorCodes::INVALID_TEMPLATE_FORMAT, "Invalid format string for Template: {} (near column {}). " + "Parsed format string:\n{}\n", message, std::to_string(column), dump()); +} + +} diff --git a/contrib/clickhouse/src/Formats/ParsedTemplateFormatString.h b/contrib/clickhouse/src/Formats/ParsedTemplateFormatString.h new file mode 100644 index 0000000000..5d7ee820f2 --- /dev/null +++ b/contrib/clickhouse/src/Formats/ParsedTemplateFormatString.h @@ -0,0 +1,46 @@ +#pragma once + +#include <base/types.h> +#include <functional> +#include <optional> +#include <vector> +#include <Formats/FormatSchemaInfo.h> +#include <Formats/FormatSettings.h> + +namespace DB +{ + +class Block; +using Strings = std::vector<String>; + +struct ParsedTemplateFormatString +{ + using EscapingRule = FormatSettings::EscapingRule; + + /// Format string has syntax: "Delimiter0 ${ColumnName0:Format0} Delimiter1 ${ColumnName1:Format1} Delimiter2" + /// The following vectors is filled with corresponding values, delimiters.size() - 1 = formats.size() = format_idx_to_column_idx.size() + /// If format_idx_to_column_idx[i] has no value, then TemplateRowInputFormat will skip i-th column. 
+ + std::vector<String> delimiters; + std::vector<EscapingRule> escaping_rules; + std::vector<std::optional<size_t>> format_idx_to_column_idx; + + /// For diagnostic info + Strings column_names; + + using ColumnIdxGetter = std::function<std::optional<size_t>(const String &)>; + + ParsedTemplateFormatString() = default; + ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name, bool allow_indexes = true); + + void parse(const String & format_string, const ColumnIdxGetter & idx_by_name, bool allow_indexes = true); + + static const char * readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s); + size_t columnsCount() const; + + String dump() const; + [[noreturn]] void throwInvalidFormat(const String & message, size_t column) const; +}; + +} + diff --git a/contrib/clickhouse/src/Formats/ProtobufReader.cpp b/contrib/clickhouse/src/Formats/ProtobufReader.cpp new file mode 100644 index 0000000000..577342bf29 --- /dev/null +++ b/contrib/clickhouse/src/Formats/ProtobufReader.cpp @@ -0,0 +1,439 @@ +#include "ProtobufReader.h" + +#if USE_PROTOBUF +# include <IO/ReadHelpers.h> + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int UNKNOWN_PROTOBUF_FORMAT; +} + + +namespace +{ + enum WireType + { + VARINT = 0, + BITS64 = 1, + LENGTH_DELIMITED = 2, + GROUP_START = 3, + GROUP_END = 4, + BITS32 = 5, + }; + + // The following conditions must always be true: + // any_cursor_position > END_OF_VARINT + // any_cursor_position > END_OF_GROUP + // Those inequations helps checking conditions in ProtobufReader::SimpleReader. + constexpr Int64 END_OF_VARINT = -1; + constexpr Int64 END_OF_GROUP = -2; + constexpr Int64 END_OF_FILE = -3; + + Int64 decodeZigZag(UInt64 n) { return static_cast<Int64>((n >> 1) ^ (~(n & 1) + 1)); } +} + + +ProtobufReader::ProtobufReader(ReadBuffer & in_) + : in(&in_) +{ +} + +void ProtobufReader::startMessage(bool with_length_delimiter_) +{ + // Start reading a root message. 
+ assert(!current_message_level); + + root_message_has_length_delimiter = with_length_delimiter_; + if (root_message_has_length_delimiter) + { + size_t size_of_message = readVarint(); + current_message_end = cursor + size_of_message; + } + else + { + current_message_end = END_OF_FILE; + } + ++current_message_level; + field_number = next_field_number = 0; + field_end = cursor; +} + +void ProtobufReader::endMessage(bool ignore_errors) +{ + if (!current_message_level) + return; + + Int64 root_message_end = (current_message_level == 1) ? current_message_end : parent_message_ends.front(); + if (cursor != root_message_end) + { + if (cursor < root_message_end) + ignore(root_message_end - cursor); + else if (root_message_end == END_OF_FILE) + ignoreAll(); + else if (ignore_errors) + moveCursorBackward(cursor - root_message_end); + else + throwUnknownFormat(); + } + + current_message_level = 0; + parent_message_ends.clear(); +} + +void ProtobufReader::startNestedMessage() +{ + assert(current_message_level >= 1); + if ((cursor > field_end) && (field_end != END_OF_GROUP)) + throwUnknownFormat(); + + // Start reading a nested message which is located inside a length-delimited field + // of another message. 
+ parent_message_ends.emplace_back(current_message_end); + current_message_end = field_end; + ++current_message_level; + field_number = next_field_number = 0; + field_end = cursor; +} + +void ProtobufReader::endNestedMessage() +{ + assert(current_message_level >= 2); + if (cursor != current_message_end) + { + if (current_message_end == END_OF_GROUP) + { + ignoreGroup(); + current_message_end = cursor; + } + else if (cursor < current_message_end) + ignore(current_message_end - cursor); + else + throwUnknownFormat(); + } + + --current_message_level; + current_message_end = parent_message_ends.back(); + parent_message_ends.pop_back(); + field_number = next_field_number = 0; + field_end = cursor; +} + +bool ProtobufReader::readFieldNumber(int & field_number_) +{ + assert(current_message_level); + if (next_field_number) + { + field_number_ = field_number = next_field_number; + next_field_number = 0; + return true; + } + + if (field_end != cursor) + { + if (field_end == END_OF_VARINT) + { + ignoreVarint(); + field_end = cursor; + } + else if (field_end == END_OF_GROUP) + { + ignoreGroup(); + field_end = cursor; + } + else if (cursor < field_end) + ignore(field_end - cursor); + else + throwUnknownFormat(); + } + + if (cursor >= current_message_end) + { + if (current_message_end == END_OF_FILE) + { + if (unlikely(in->eof())) + { + current_message_end = cursor; + return false; + } + } + else if (current_message_end == END_OF_GROUP) + { + /// We'll check for the `GROUP_END` marker later. 
+ } + else + return false; + } + + UInt64 varint = readVarint(); + if (unlikely(varint & (static_cast<UInt64>(0xFFFFFFFF) << 32))) + throwUnknownFormat(); + UInt32 key = static_cast<UInt32>(varint); + field_number_ = field_number = (key >> 3); + next_field_number = 0; + WireType wire_type = static_cast<WireType>(key & 0x07); + switch (wire_type) + { + case BITS32: + { + field_end = cursor + 4; + return true; + } + case BITS64: + { + field_end = cursor + 8; + return true; + } + case LENGTH_DELIMITED: + { + size_t length = readVarint(); + field_end = cursor + length; + return true; + } + case VARINT: + { + field_end = END_OF_VARINT; + return true; + } + case GROUP_START: + { + field_end = END_OF_GROUP; + return true; + } + case GROUP_END: + { + if (current_message_end != END_OF_GROUP) + throwUnknownFormat(); + current_message_end = cursor; + return false; + } + } + throwUnknownFormat(); +} + +UInt64 ProtobufReader::readUInt() +{ + UInt64 value; + if (field_end == END_OF_VARINT) + { + value = readVarint(); + field_end = cursor; + } + else + { + value = readVarint(); + if (cursor < field_end) + next_field_number = field_number; + else if (unlikely(cursor) > field_end) + throwUnknownFormat(); + } + return value; +} + +Int64 ProtobufReader::readInt() +{ + return static_cast<Int64>(readUInt()); +} + +Int64 ProtobufReader::readSInt() +{ + return decodeZigZag(readUInt()); +} + +template<typename T> +T ProtobufReader::readFixed() +{ + if (unlikely(cursor + static_cast<Int64>(sizeof(T)) > field_end)) + throwUnknownFormat(); + T value; + readBinary(&value, sizeof(T)); + if (cursor < field_end) + next_field_number = field_number; + return value; +} + +template Int32 ProtobufReader::readFixed<Int32>(); +template UInt32 ProtobufReader::readFixed<UInt32>(); +template Int64 ProtobufReader::readFixed<Int64>(); +template UInt64 ProtobufReader::readFixed<UInt64>(); +template Float32 ProtobufReader::readFixed<Float32>(); +template Float64 ProtobufReader::readFixed<Float64>(); + +void 
ProtobufReader::readString(String & str) +{ + if (unlikely(cursor > field_end)) + throwUnknownFormat(); + size_t length = field_end - cursor; + str.resize(length); + readBinary(reinterpret_cast<char*>(str.data()), length); +} + +void ProtobufReader::readStringAndAppend(PaddedPODArray<UInt8> & str) +{ + if (unlikely(cursor > field_end)) + throwUnknownFormat(); + size_t length = field_end - cursor; + size_t old_size = str.size(); + str.resize(old_size + length); + readBinary(reinterpret_cast<char*>(str.data() + old_size), length); +} + +void ProtobufReader::readBinary(void* data, size_t size) +{ + in->readStrict(reinterpret_cast<char*>(data), size); + cursor += size; +} + +void ProtobufReader::ignore(UInt64 num_bytes) +{ + in->ignore(num_bytes); + cursor += num_bytes; +} + +void ProtobufReader::ignoreAll() +{ + cursor += in->tryIgnore(std::numeric_limits<size_t>::max()); +} + +void ProtobufReader::moveCursorBackward(UInt64 num_bytes) +{ + if (in->offset() < num_bytes) + throwUnknownFormat(); + in->position() -= num_bytes; + cursor -= num_bytes; +} + +UInt64 ProtobufReader::continueReadingVarint(UInt64 first_byte) +{ + UInt64 result = (first_byte & ~static_cast<UInt64>(0x80)); + char c; + +# define PROTOBUF_READER_READ_VARINT_BYTE(byteNo) \ + do \ + { \ + in->readStrict(c); \ + ++cursor; \ + if constexpr ((byteNo) < 10) \ + { \ + result |= static_cast<UInt64>(static_cast<UInt8>(c)) << (7 * ((byteNo)-1)); \ + if (likely(!(c & 0x80))) \ + return result; \ + } \ + else \ + { \ + if (likely(c == 1)) \ + return result; \ + } \ + if constexpr ((byteNo) < 9) \ + result &= ~(static_cast<UInt64>(0x80) << (7 * ((byteNo)-1))); \ + } while (false) + + PROTOBUF_READER_READ_VARINT_BYTE(2); + PROTOBUF_READER_READ_VARINT_BYTE(3); + PROTOBUF_READER_READ_VARINT_BYTE(4); + PROTOBUF_READER_READ_VARINT_BYTE(5); + PROTOBUF_READER_READ_VARINT_BYTE(6); + PROTOBUF_READER_READ_VARINT_BYTE(7); + PROTOBUF_READER_READ_VARINT_BYTE(8); + PROTOBUF_READER_READ_VARINT_BYTE(9); + 
PROTOBUF_READER_READ_VARINT_BYTE(10); + +# undef PROTOBUF_READER_READ_VARINT_BYTE + + throwUnknownFormat(); +} + +void ProtobufReader::ignoreVarint() +{ + char c; + +# define PROTOBUF_READER_IGNORE_VARINT_BYTE(byteNo) \ + do \ + { \ + in->readStrict(c); \ + ++cursor; \ + if constexpr ((byteNo) < 10) \ + { \ + if (likely(!(c & 0x80))) \ + return; \ + } \ + else \ + { \ + if (likely(c == 1)) \ + return; \ + } \ + } while (false) + + PROTOBUF_READER_IGNORE_VARINT_BYTE(1); + PROTOBUF_READER_IGNORE_VARINT_BYTE(2); + PROTOBUF_READER_IGNORE_VARINT_BYTE(3); + PROTOBUF_READER_IGNORE_VARINT_BYTE(4); + PROTOBUF_READER_IGNORE_VARINT_BYTE(5); + PROTOBUF_READER_IGNORE_VARINT_BYTE(6); + PROTOBUF_READER_IGNORE_VARINT_BYTE(7); + PROTOBUF_READER_IGNORE_VARINT_BYTE(8); + PROTOBUF_READER_IGNORE_VARINT_BYTE(9); + PROTOBUF_READER_IGNORE_VARINT_BYTE(10); + +# undef PROTOBUF_READER_IGNORE_VARINT_BYTE + + throwUnknownFormat(); +} + +void ProtobufReader::ignoreGroup() +{ + size_t level = 1; + while (true) + { + UInt64 varint = readVarint(); + WireType wire_type = static_cast<WireType>(varint & 0x07); + switch (wire_type) + { + case VARINT: + { + ignoreVarint(); + break; + } + case BITS64: + { + ignore(8); + break; + } + case LENGTH_DELIMITED: + { + ignore(readVarint()); + break; + } + case GROUP_START: + { + ++level; + break; + } + case GROUP_END: + { + if (!--level) + return; + break; + } + case BITS32: + { + ignore(4); + break; + } + } + throwUnknownFormat(); + } +} + +[[noreturn]] void ProtobufReader::throwUnknownFormat() const +{ + throw Exception(ErrorCodes::UNKNOWN_PROTOBUF_FORMAT, "Protobuf messages are corrupted or don't match the provided schema.{}", + root_message_has_length_delimiter + ? " Please note that Protobuf stream is length-delimited: every message is prefixed by its length in varint." 
/// Utility class for reading in the Protobuf format.
/// Knows nothing about protobuf schemas, just provides low-level functions to deserialize data:
/// walking over messages and fields and reading values of the basic wire types.
class ProtobufReader
{
public:
    explicit ProtobufReader(ReadBuffer & in_);

    /// Starts / finishes reading of a root message.
    /// `with_length_delimiter_` tells whether the message is prefixed by its length (a varint).
    void startMessage(bool with_length_delimiter_);
    void endMessage(bool ignore_errors);

    /// Starts / finishes reading of a message stored inside the current field.
    void startNestedMessage();
    void endNestedMessage();

    /// Reads the number of the next field of the current message.
    /// Returns false when there are no more fields in the message.
    bool readFieldNumber(int & field_number);

    /// Read a value from the current field.
    Int64 readInt();
    Int64 readSInt();
    UInt64 readUInt();
    template<typename T> T readFixed();

    void readString(String & str);
    void readStringAndAppend(PaddedPODArray<UInt8> & str);

    bool eof() const { return in->eof(); }

    /// Switches to another input buffer. No other state of the reader is changed.
    void setReadBuffer(ReadBuffer & in_) { in = &in_; }

private:
    void readBinary(void * data, size_t size);
    void ignore(UInt64 num_bytes);
    void ignoreAll();
    void moveCursorBackward(UInt64 num_bytes);

    /// Reads a varint. The common case of a single-byte varint is handled inline,
    /// longer ones are finished by continueReadingVarint().
    UInt64 ALWAYS_INLINE readVarint()
    {
        char c;
        in->readStrict(c);
        UInt64 first_byte = static_cast<UInt8>(c);
        ++cursor;
        /// The highest bit is a continuation flag: when it is not set, the varint ends here.
        if (likely(!(c & 0x80)))
            return first_byte;
        return continueReadingVarint(first_byte);
    }

    UInt64 continueReadingVarint(UInt64 first_byte);
    void ignoreVarint();
    void ignoreGroup();
    [[noreturn]] void throwUnknownFormat() const;

    ReadBuffer * in;

    /// Number of bytes consumed so far; all the `*_end` members below are expressed in the same units.
    Int64 cursor = 0;

    bool root_message_has_length_delimiter = false;

    /// Zero when no message is being read, 1 for a root message, more for nested ones.
    size_t current_message_level = 0;

    /// Where the current message ends and where the enclosing messages end.
    Int64 current_message_end = 0;
    std::vector<Int64> parent_message_ends;

    /// The field being read, and the field number to return from the next readFieldNumber()
    /// without parsing a key (non-zero when the current field still has unread values).
    int field_number = 0;
    int next_field_number = 0;

    /// Where the current field ends.
    Int64 field_end = 0;
};
a/contrib/clickhouse/src/Formats/ProtobufSchemas.cpp b/contrib/clickhouse/src/Formats/ProtobufSchemas.cpp new file mode 100644 index 0000000000..5557d8dad1 --- /dev/null +++ b/contrib/clickhouse/src/Formats/ProtobufSchemas.cpp @@ -0,0 +1,121 @@ +#include "clickhouse_config.h" + +#if USE_PROTOBUF +# include <Formats/FormatSchemaInfo.h> +# error #include <Formats/ProtobufSchemas.h> +# error #include <google/protobuf/compiler/importer.h> +# include <Common/Exception.h> + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int CANNOT_PARSE_PROTOBUF_SCHEMA; +} + +ProtobufSchemas & ProtobufSchemas::instance() +{ + static ProtobufSchemas instance; + return instance; +} + +class ProtobufSchemas::ImporterWithSourceTree : public google::protobuf::compiler::MultiFileErrorCollector +{ +public: + explicit ImporterWithSourceTree(const String & schema_directory, WithEnvelope with_envelope_) + : importer(&disk_source_tree, this) + , with_envelope(with_envelope_) + { + disk_source_tree.MapPath("", schema_directory); + } + + ~ImporterWithSourceTree() override = default; + + const google::protobuf::Descriptor * import(const String & schema_path, const String & message_name) + { + // Search the message type among already imported ones. 
+ const auto * descriptor = importer.pool()->FindMessageTypeByName(message_name); + if (descriptor) + return descriptor; + + const auto * file_descriptor = importer.Import(schema_path); + if (error) + { + auto info = error.value(); + error.reset(); + throw Exception( + ErrorCodes::CANNOT_PARSE_PROTOBUF_SCHEMA, + "Cannot parse '{}' file, found an error at line {}, column {}, {}", + info.filename, + std::to_string(info.line), + std::to_string(info.column), + info.message); + } + + assert(file_descriptor); + + if (with_envelope == WithEnvelope::No) + { + const auto * message_descriptor = file_descriptor->FindMessageTypeByName(message_name); + if (!message_descriptor) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Could not find a message named '{}' in the schema file '{}'", + message_name, schema_path); + + return message_descriptor; + } + else + { + const auto * envelope_descriptor = file_descriptor->FindMessageTypeByName("Envelope"); + if (!envelope_descriptor) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Could not find a message named 'Envelope' in the schema file '{}'", + schema_path); + + const auto * message_descriptor = envelope_descriptor->FindNestedTypeByName(message_name); // silly protobuf API disallows a restricting the field type to messages + if (!message_descriptor) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Could not find a message named '{}' in the schema file '{}'", + message_name, schema_path); + + return message_descriptor; + } + } + +private: + // Overrides google::protobuf::compiler::MultiFileErrorCollector: + void AddError(const String & filename, int line, int column, const String & message) override + { + /// Protobuf library code is not exception safe, we should + /// remember the error and throw it later from our side. 
error = ErrorInfo{filename, line, column, message};
    }

    google::protobuf::compiler::DiskSourceTree disk_source_tree;
    google::protobuf::compiler::Importer importer;
    const WithEnvelope with_envelope;

    /// Snapshot of one error reported through AddError().
    struct ErrorInfo
    {
        String filename;
        int line;
        int column;
        String message;
    };

    /// Set by AddError(); holds the most recently reported error, if any.
    std::optional<ErrorInfo> error;
};


/// Returns the descriptor of the message `info.messageName()` from the schema file `info.schemaPath()`.
///
/// Holds `mutex` for the whole call. Importers are cached in `importers`, keyed by
/// `info.schemaDirectory()`; a new ImporterWithSourceTree is created only when the directory
/// has no entry yet, and the lookup is then delegated to its import().
///
/// NOTE(review): `with_envelope` is passed only to the constructor of a newly created importer.
/// A later call for the same schema directory with a different `with_envelope` reuses the
/// importer built with the first value - confirm that callers never mix the two modes.
const google::protobuf::Descriptor * ProtobufSchemas::getMessageTypeForFormatSchema(const FormatSchemaInfo & info, WithEnvelope with_envelope)
{
    std::lock_guard lock(mutex);
    auto it = importers.find(info.schemaDirectory());
    if (it == importers.end())
        it = importers.emplace(info.schemaDirectory(), std::make_unique<ImporterWithSourceTree>(info.schemaDirectory(), with_envelope)).first;
    auto * importer = it->second.get();
    return importer->import(info.schemaPath(), info.messageName());
}

}

#endif
diff --git a/contrib/clickhouse/src/Formats/ProtobufSerializer.cpp b/contrib/clickhouse/src/Formats/ProtobufSerializer.cpp
new file mode 100644
index 0000000000..0dbed1e4b3
--- /dev/null
+++ b/contrib/clickhouse/src/Formats/ProtobufSerializer.cpp
@@ -0,0 +1,3873 @@
#include <Formats/ProtobufSerializer.h>

#if USE_PROTOBUF
#   include <Columns/ColumnAggregateFunction.h>
#   include <Columns/ColumnArray.h>
#   include <Columns/ColumnDecimal.h>
#   include <Columns/ColumnLowCardinality.h>
#   include <Columns/ColumnMap.h>
#   include <Columns/ColumnNullable.h>
#   include <Columns/ColumnFixedString.h>
#   include <Columns/ColumnString.h>
#   include <Columns/ColumnTuple.h>
#   include <Columns/ColumnVector.h>
#   include <Common/PODArray.h>
#   include <Common/quoteString.h>
#   include <Core/DecimalComparison.h>
#   include <DataTypes/DataTypeAggregateFunction.h>
#   include <DataTypes/DataTypeArray.h>
#   include <DataTypes/DataTypesDecimal.h>
#   include <DataTypes/DataTypeDateTime64.h>
#   include <DataTypes/DataTypeEnum.h>
#   include <DataTypes/DataTypeFixedString.h>
#   include <DataTypes/DataTypeLowCardinality.h>
#   include <DataTypes/DataTypeMap.h>
#   include <DataTypes/DataTypeNullable.h>
#   include <DataTypes/DataTypeTuple.h>
#   include <DataTypes/DataTypeString.h>
#   include <DataTypes/Serializations/SerializationDecimal.h>
#   include <DataTypes/Serializations/SerializationFixedString.h>
#   include <Formats/ProtobufReader.h>
#   include <Formats/ProtobufWriter.h>
#   include <Formats/RowInputMissingColumnsFiller.h>
#   include <IO/Operators.h>
#   include <IO/ReadBufferFromString.h>
#   include <IO/ReadHelpers.h>
#   include <IO/WriteBufferFromString.h>
#   include <IO/WriteHelpers.h>
#   include <base/range.h>
#   include <base/sort.h>
#   error #include <google/protobuf/descriptor.h>
#   error #include <google/protobuf/descriptor.pb.h>
#   include <boost/algorithm/string.hpp>
#   include <boost/container/flat_map.hpp>
#   include <boost/container/flat_set.hpp>
#   include <boost/numeric/conversion/cast.hpp>
#   include <boost/range/algorithm.hpp>
#   include <boost/range/algorithm_ext/erase.hpp>
#   include <Common/logger_useful.h>

namespace DB
{
namespace ErrorCodes
{
    extern const int NO_COLUMNS_SERIALIZED_TO_PROTOBUF_FIELDS;
    extern const int MULTIPLE_COLUMNS_SERIALIZED_TO_SAME_PROTOBUF_FIELD;
    extern const int NO_COLUMN_SERIALIZED_TO_REQUIRED_PROTOBUF_FIELD;
    extern const int DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD;
    extern const int PROTOBUF_FIELD_NOT_REPEATED;
    extern const int PROTOBUF_BAD_CAST;
    extern const int LOGICAL_ERROR;
    extern const int BAD_ARGUMENTS;
    extern const int ILLEGAL_COLUMN;
}

namespace
{
using FieldDescriptor = google::protobuf::FieldDescriptor;
using MessageDescriptor = google::protobuf::Descriptor;
using FieldTypeId = google::protobuf::FieldDescriptor::Type;


/// Compares column's name with protobuf field's name.
/// This comparison is case-insensitive and ignores the difference between '.'
and '_' + struct ColumnNameWithProtobufFieldNameComparator + { + static bool equals(char c1, char c2) + { + return convertChar(c1) == convertChar(c2); + } + + static bool equals(std::string_view s1, std::string_view s2) + { + return (s1.length() == s2.length()) + && std::equal(s1.begin(), s1.end(), s2.begin(), [](char c1, char c2) { return convertChar(c1) == convertChar(c2); }); + } + + static bool less(std::string_view s1, std::string_view s2) + { + return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), s2.end(), [](char c1, char c2) { return convertChar(c1) < convertChar(c2); }); + } + + static bool startsWith(std::string_view s1, std::string_view s2) + { + return (s1.length() >= s2.length()) && equals(s1.substr(0, s2.length()), s2); + } + + static char convertChar(char c) + { + c = tolower(c); + if (c == '.') + c = '_'; + return c; + } + }; + + bool isGoogleWrapperMessage(const MessageDescriptor & message_descriptor) + { + auto message_type = message_descriptor.well_known_type(); + return (message_type >= google::protobuf::Descriptor::WELLKNOWNTYPE_DOUBLEVALUE) + && (message_type <= google::protobuf::Descriptor::WELLKNOWNTYPE_BOOLVALUE); + } + + bool isGoogleWrapperField(const FieldDescriptor & field_descriptor) + { + const auto * message_descriptor = field_descriptor.message_type(); + if (message_descriptor == nullptr) + return false; + return isGoogleWrapperMessage(*message_descriptor); + } + + bool isGoogleWrapperField(const FieldDescriptor * field_descriptor) + { + if (field_descriptor == nullptr) + return false; + return isGoogleWrapperField(*field_descriptor); + } + + std::string_view googleWrapperColumnName(const FieldDescriptor & field_descriptor) + { + assert(isGoogleWrapperField(field_descriptor)); + return field_descriptor.message_type()->field(0)->name(); + } + + // Should we omit null values (zero for numbers / empty string for strings) while storing them. 
bool shouldSkipZeroOrEmpty(const FieldDescriptor & field_descriptor, bool google_wrappers_special_treatment = false)
{
    /// Only optional fields may be omitted.
    if (!field_descriptor.is_optional())
        return false;
    /// Fields of a map entry message are never omitted.
    if (field_descriptor.containing_type()->options().map_entry())
        return false;
    /// With the special treatment enabled, Google wrapper fields are never omitted.
    if (google_wrappers_special_treatment && isGoogleWrapperField(field_descriptor))
        return false;
    /// Otherwise omit for message-typed fields and for any field of a proto3 file.
    return field_descriptor.message_type() || (field_descriptor.file()->syntax() == google::protobuf::FileDescriptor::SYNTAX_PROTO3);
}

// Should we pack repeated values while storing them.
// Only repeated fields of the scalar/enum types listed below can be packed; for those an explicit
// `packed` option wins, otherwise packing is the default only for proto3 files.
bool shouldPackRepeated(const FieldDescriptor & field_descriptor)
{
    if (!field_descriptor.is_repeated())
        return false;
    switch (field_descriptor.type())
    {
        case FieldTypeId::TYPE_INT32:
        case FieldTypeId::TYPE_UINT32:
        case FieldTypeId::TYPE_SINT32:
        case FieldTypeId::TYPE_INT64:
        case FieldTypeId::TYPE_UINT64:
        case FieldTypeId::TYPE_SINT64:
        case FieldTypeId::TYPE_FIXED32:
        case FieldTypeId::TYPE_SFIXED32:
        case FieldTypeId::TYPE_FIXED64:
        case FieldTypeId::TYPE_SFIXED64:
        case FieldTypeId::TYPE_FLOAT:
        case FieldTypeId::TYPE_DOUBLE:
        case FieldTypeId::TYPE_BOOL:
        case FieldTypeId::TYPE_ENUM:
            break;
        default:
            return false;
    }
    if (field_descriptor.options().has_packed())
        return field_descriptor.options().packed();
    return field_descriptor.file()->syntax() == google::protobuf::FileDescriptor::SYNTAX_PROTO3;
}

/// Writes `size` indentation levels (4 spaces each) to `out` and returns `out` for chaining.
WriteBuffer & writeIndent(WriteBuffer & out, size_t size) { return out << String(size * 4, ' '); }


/// Throws LOGICAL_ERROR reporting the expected and the actually specified number of columns.
[[noreturn]] void wrongNumberOfColumns(size_t number_of_columns, const String & expected)
{
    throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong number of columns: expected {}, specified {}", expected, number_of_columns);
}


/// Holds a pointer to either a ProtobufReader or a ProtobufWriter; the other pointer stays null.
/// Implicitly constructible from a reference to either of them.
struct ProtobufReaderOrWriter
{
    ProtobufReaderOrWriter(ProtobufReader & reader_) : reader(&reader_) {} // NOLINT(google-explicit-constructor)
    ProtobufReaderOrWriter(ProtobufWriter & writer_) : writer(&writer_) {} // NOLINT(google-explicit-constructor)
    ProtobufReader * const reader = nullptr;
    ProtobufWriter * const writer = nullptr;
};


/// Base class for all serializers which serialize a single value.
/// Keeps the column name, the target field descriptor (with its type id and tag number) and the
/// reader or writer, and offers helpers that forward to them with number conversion and
/// unified error reporting.
class ProtobufSerializerSingleValue : public ProtobufSerializer
{
protected:
    ProtobufSerializerSingleValue(
        std::string_view column_name_,
        const FieldDescriptor & field_descriptor_,
        const ProtobufReaderOrWriter & reader_or_writer_)
        : column_name(column_name_)
        , field_descriptor(field_descriptor_)
        , field_typeid(field_descriptor_.type())
        , field_tag(field_descriptor.number())
        , reader(reader_or_writer_.reader)
        , writer(reader_or_writer_.writer)
        , skip_zero_or_empty(shouldSkipZeroOrEmpty(field_descriptor))
    {
    }

    /// Accepts exactly one column; any other count throws via wrongNumberOfColumns().
    void setColumns(const ColumnPtr * columns, [[maybe_unused]] size_t num_columns) override
    {
        if (num_columns != 1)
            wrongNumberOfColumns(num_columns, "1");
        column = columns[0];
    }

    /// Same as above for mutable columns; stores the column's getPtr().
    void setColumns(const MutableColumnPtr * columns, [[maybe_unused]] size_t num_columns) override
    {
        if (num_columns != 1)
            wrongNumberOfColumns(num_columns, "1");
        column = columns[0]->getPtr();
    }

    /// The write* helpers below cast the value to the wire type via castNumber() and skip the
    /// write entirely when the casted value is zero and skip_zero_or_empty is set.
    template <typename NumberType>
    void writeInt(NumberType value)
    {
        auto casted = castNumber<Int64>(value);
        if (casted != 0 || !skip_zero_or_empty)
            writer->writeInt(field_tag, casted);
    }

    template <typename NumberType>
    void writeSInt(NumberType value)
    {
        auto casted = castNumber<Int64>(value);
        if (casted != 0 || !skip_zero_or_empty)
            writer->writeSInt(field_tag, casted);
    }

    template <typename NumberType>
    void writeUInt(NumberType value)
    {
        auto casted = castNumber<UInt64>(value);
        if (casted != 0 || !skip_zero_or_empty)
            writer->writeUInt(field_tag, casted);
    }

    template <typename FieldType, typename NumberType>
    void writeFixed(NumberType value)
    {
        auto casted = castNumber<FieldType>(value);
        if (casted != 0 || !skip_zero_or_empty)
            writer->writeFixed(field_tag, casted);
    }

    /// Thin forwards to the reader.
    Int64 readInt() { return reader->readInt(); }
    Int64 readSInt() { return reader->readSInt(); }
    UInt64 readUInt() { return reader->readUInt(); }

    template <typename FieldType>
    FieldType readFixed()
    {
        return reader->readFixed<FieldType>();
    }

    /// Writes a string; an empty string is skipped when skip_zero_or_empty is set.
    void writeStr(std::string_view str)
    {
        if (!str.empty() || !skip_zero_or_empty)
            writer->writeString(field_tag, str);
    }

    void readStr(String & str) { reader->readString(str); }
    void readStrAndAppend(PaddedPODArray<UInt8> & str) { reader->readStringAndAppend(str); }

    /// Parses `str` as DestType with readText(); any exception raised while parsing is
    /// replaced by the PROTOBUF_BAD_CAST exception thrown from cannotConvertValue().
    template <typename DestType>
    DestType parseFromStr(std::string_view str) const
    {
        try
        {
            DestType result;
            ReadBufferFromMemory buf(str.data(), str.length());
            readText(result, buf);
            return result;
        }
        catch (...)
        {
            cannotConvertValue(str, "String", TypeName<DestType>);
        }
    }

    /// Converts between number types with boost::numeric_cast; a bad_numeric_cast is
    /// reported through cannotConvertValue(). Identical types are returned as is.
    template <typename DestType, typename SrcType>
    DestType castNumber(SrcType value) const
    {
        if constexpr (std::is_same_v<DestType, SrcType>)
            return value;
        DestType result;
        try
        {
            /// TODO: use accurate::convertNumeric() maybe?
            /// IPv4 is converted through its underlying value.
            if constexpr (std::is_same_v<SrcType, IPv4>)
                result = boost::numeric_cast<DestType>(value.toUnderType());
            else
                result = boost::numeric_cast<DestType>(value);
        }
        catch (boost::numeric::bad_numeric_cast &)
        {
            cannotConvertValue(toString(value), TypeName<SrcType>, TypeName<DestType>);
        }
        return result;
    }

    /// Throws DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD naming the column, its type and the field.
    [[noreturn]] void incompatibleColumnType(std::string_view column_type) const
    {
        throw Exception(
            ErrorCodes::DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD,
            "The column {} ({}) cannot be serialized to the field {} ({}) due to their types are not compatible",
            quoteString(column_name),
            column_type,
            quoteString(field_descriptor.full_name()),
            field_descriptor.type_name());
    }

    /// Throws PROTOBUF_BAD_CAST; the wording depends on whether a reader is set
    /// ("reading ... for inserting into") or not ("writing ... extracted from").
    [[noreturn]] void cannotConvertValue(std::string_view src_value, std::string_view src_type_name, std::string_view dest_type_name) const
    {
        throw Exception(ErrorCodes::PROTOBUF_BAD_CAST,
            "Could not convert value '{}' from type {} to type {} while {} field {} {} column {}",
            String{src_value}, String{src_type_name}, String{dest_type_name},
            (reader ? "reading" : "writing"), quoteString(field_descriptor.name()),
            (reader ? "for inserting into" : "extracted from"), quoteString(column_name));
    }

    const String column_name;
    const FieldDescriptor & field_descriptor;
    const FieldTypeId field_typeid;
    const int field_tag;
    ProtobufReader * const reader;
    ProtobufWriter * const writer;
    ColumnPtr column;

private:
    /// Computed once in the constructor with shouldSkipZeroOrEmpty(field_descriptor).
    const bool skip_zero_or_empty;
};


/// Serializes any ColumnVector<NumberType> to a field of any type except TYPE_MESSAGE, TYPE_GROUP.
/// NumberType must be one of the following types: Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64,
/// Int128, UInt128, Int256, UInt256, Float32, Float64.
/// And the field's type cannot be TYPE_ENUM if NumberType is Float32 or Float64.
template <typename NumberType>
class ProtobufSerializerNumber : public ProtobufSerializerSingleValue
{
public:
    using ColumnType = ColumnVector<NumberType>;

    /// Selects the write/read/default functions for the field's type; throws through
    /// incompatibleColumnType() if the field type is not supported.
    ProtobufSerializerNumber(std::string_view column_name_, const FieldDescriptor & field_descriptor_, const ProtobufReaderOrWriter & reader_or_writer_)
        : ProtobufSerializerSingleValue(column_name_, field_descriptor_, reader_or_writer_)
    {
        setFunctions();
    }

    /// Writes the element at `row_num` with the selected write function.
    void writeRow(size_t row_num) override
    {
        const auto & column_vector = assert_cast<const ColumnType &>(*column);
        write_function(column_vector.getElement(row_num));
    }

    /// Reads one value; overwrites the element if the row already exists, otherwise appends it.
    void readRow(size_t row_num) override
    {
        NumberType value = read_function();
        auto & column_vector = assert_cast<ColumnType &>(column->assumeMutableRef());
        if (row_num < column_vector.size())
            column_vector.getElement(row_num) = value;
        else
            column_vector.insertValue(value);
    }

    /// Appends the field's default value unless the row already exists.
    void insertDefaults(size_t row_num) override
    {
        auto & column_vector = assert_cast<ColumnType &>(column->assumeMutableRef());
        if (row_num < column_vector.size())
            return;
        column_vector.insertValue(getDefaultNumber());
    }

    void describeTree(WriteBuffer & out, size_t indent) const override
    {
        writeIndent(out, indent) << "ProtobufSerializerNumber<" << TypeName<NumberType> << ">: column " << quoteString(column_name)
                                 << " -> field " << quoteString(field_descriptor.full_name()) << " (" << field_descriptor.type_name()
                                 << ")\n";
    }

private:
    /// Fills write_function / read_function / default_function according to field_typeid.
    /// The lambdas capture `this`.
    void setFunctions()
    {
        switch (field_typeid)
        {
            case FieldTypeId::TYPE_INT32:
            {
                write_function = [this](NumberType value) { writeInt(value); };
                read_function = [this]() -> NumberType { return castNumber<NumberType>(readInt()); };
                default_function = [this]() -> NumberType { return castNumber<NumberType>(field_descriptor.default_value_int32()); };
                break;
            }

            case FieldTypeId::TYPE_SINT32:
            {
                write_function = [this](NumberType value) { writeSInt(value); };
                read_function = [this]() -> NumberType { return castNumber<NumberType>(readSInt()); };
                default_function = [this]() -> NumberType { return castNumber<NumberType>(field_descriptor.default_value_int32()); };
                break;
            }

            case FieldTypeId::TYPE_UINT32:
            {
                write_function = [this](NumberType value) { writeUInt(value); };
                read_function = [this]() -> NumberType { return castNumber<NumberType>(readUInt()); };
                default_function = [this]() -> NumberType { return castNumber<NumberType>(field_descriptor.default_value_uint32()); };
                break;
            }

            case FieldTypeId::TYPE_INT64:
            {
                write_function = [this](NumberType value) { writeInt(value); };
                read_function = [this]() -> NumberType { return castNumber<NumberType>(readInt()); };
                default_function = [this]() -> NumberType { return castNumber<NumberType>(field_descriptor.default_value_int64()); };
                break;
            }

            case FieldTypeId::TYPE_SINT64:
            {
                write_function = [this](NumberType value) { writeSInt(value); };
                read_function = [this]() -> NumberType { return castNumber<NumberType>(readSInt()); };
                default_function = [this]() -> NumberType { return castNumber<NumberType>(field_descriptor.default_value_int64()); };
                break;
            }

            case FieldTypeId::TYPE_UINT64:
            {
                write_function = [this](NumberType value) { writeUInt(value); };
                read_function = [this]() -> NumberType { return castNumber<NumberType>(readUInt()); };
                default_function = [this]() -> NumberType { return castNumber<NumberType>(field_descriptor.default_value_uint64()); };
                break;
            }

            case FieldTypeId::TYPE_FIXED32:
            {
                write_function = [this](NumberType value) { writeFixed<UInt32>(value); };
                read_function = [this]() -> NumberType { return castNumber<NumberType>(readFixed<UInt32>()); };
                default_function = [this]() -> NumberType { return castNumber<NumberType>(field_descriptor.default_value_uint32()); };
                break;
            }

            case FieldTypeId::TYPE_SFIXED32:
            {
                write_function = [this](NumberType value) { writeFixed<Int32>(value); };
                read_function = [this]() -> NumberType { return castNumber<NumberType>(readFixed<Int32>()); };
                default_function = [this]() -> NumberType { return castNumber<NumberType>(field_descriptor.default_value_int32()); };
                break;
            }

            case FieldTypeId::TYPE_FIXED64:
            {
                write_function = [this](NumberType value) { writeFixed<UInt64>(value); };
                read_function = [this]() -> NumberType { return castNumber<NumberType>(readFixed<UInt64>()); };
                default_function = [this]() -> NumberType { return castNumber<NumberType>(field_descriptor.default_value_uint64()); };
                break;
            }

            case FieldTypeId::TYPE_SFIXED64:
            {
                write_function = [this](NumberType value) { writeFixed<Int64>(value); };
                read_function = [this]() -> NumberType { return castNumber<NumberType>(readFixed<Int64>()); };
                default_function = [this]() -> NumberType { return castNumber<NumberType>(field_descriptor.default_value_int64()); };
                break;
            }

            case FieldTypeId::TYPE_FLOAT:
            {
                write_function = [this](NumberType value) { writeFixed<Float32>(value); };
                read_function = [this]() -> NumberType { return castNumber<NumberType>(readFixed<Float32>()); };
                default_function = [this]() -> NumberType { return castNumber<NumberType>(field_descriptor.default_value_float()); };
                break;
            }

            case FieldTypeId::TYPE_DOUBLE:
            {
                write_function = [this](NumberType value) { writeFixed<Float64>(value); };
                read_function = [this]() -> NumberType { return castNumber<NumberType>(readFixed<Float64>()); };
                default_function = [this]() -> NumberType { return castNumber<NumberType>(field_descriptor.default_value_double()); };
                break;
            }

            case FieldTypeId::TYPE_BOOL:
            {
                /// Only the values 0 and 1 are accepted in either direction.
                write_function = [this](NumberType value)
                {
                    if (value == 0)
                        writeUInt(0);
                    else if (value == 1)
                        writeUInt(1);
                    else
                        cannotConvertValue(toString(value), TypeName<NumberType>, field_descriptor.type_name());
                };

                read_function = [this]() -> NumberType
                {
                    UInt64 u64 = readUInt();
                    if (u64 < 2)
                        return castNumber<NumberType>(u64);
                    else
                        cannotConvertValue(toString(u64), field_descriptor.type_name(), TypeName<NumberType>);
                };

                default_function = [this]() -> NumberType { return static_cast<NumberType>(field_descriptor.default_value_bool()); };
                break;
            }

            case FieldTypeId::TYPE_STRING:
            case FieldTypeId::TYPE_BYTES:
            {
                /// Numbers are stored as their text representation; text_buffer is reused as scratch space.
                write_function = [this](NumberType value)
                {
                    WriteBufferFromString buf{text_buffer};
                    writeText(value, buf);
                    buf.finalize();
                    writeStr(text_buffer);
                };

                read_function = [this]() -> NumberType
                {
                    readStr(text_buffer);
                    return parseFromStr<NumberType>(text_buffer);
                };

                default_function = [this]() -> NumberType { return parseFromStr<NumberType>(field_descriptor.default_value_string()); };
                break;
            }

            case FieldTypeId::TYPE_ENUM:
            {
                /// Floating-point columns cannot be mapped to a protobuf enum.
                if (std::is_floating_point_v<NumberType>)
                    incompatibleColumnType(TypeName<NumberType>);

                /// On write the number must be one of the values declared in the protobuf enum.
                write_function = [this](NumberType value)
                {
                    int number = castNumber<int>(value);
                    checkProtobufEnumValue(number);
                    writeInt(number);
                };

                read_function = [this]() -> NumberType { return castNumber<NumberType>(readInt()); };
                default_function = [this]() -> NumberType { return castNumber<NumberType>(field_descriptor.default_value_enum()->number()); };
                break;
            }

            default:
                incompatibleColumnType(TypeName<NumberType>);
        }
    }

    /// Returns the field's default value, computing it with default_function on first use.
    NumberType getDefaultNumber()
    {
        if (!default_number)
            default_number = default_function();
        return *default_number;
    }

    /// Throws through cannotConvertValue() if `value` is not a number declared in the field's enum type.
    void checkProtobufEnumValue(int value) const
    {
        const auto * enum_value_descriptor = field_descriptor.enum_type()->FindValueByNumber(value);
        if (!enum_value_descriptor)
            cannotConvertValue(toString(value), TypeName<NumberType>, field_descriptor.type_name());
    }

protected:
    std::function<void(NumberType)> write_function;
    std::function<NumberType()> read_function;
    std::function<NumberType()> default_function;
    String text_buffer;

private:
    /// Cache for getDefaultNumber().
    std::optional<NumberType> default_number;
};


/// Serializes ColumnString or
/// ColumnFixedString to a field of any type except TYPE_MESSAGE, TYPE_GROUP.
template <bool is_fixed_string>
class ProtobufSerializerString : public ProtobufSerializerSingleValue
{
public:
    using ColumnType = std::conditional_t<is_fixed_string, ColumnFixedString, ColumnString>;

    /// Constructor for FixedString columns (enforced by static_assert); `n` is taken from the data type.
    ProtobufSerializerString(
        std::string_view column_name_,
        const std::shared_ptr<const DataTypeFixedString> & fixed_string_data_type_,
        const google::protobuf::FieldDescriptor & field_descriptor_,
        const ProtobufReaderOrWriter & reader_or_writer_)
        : ProtobufSerializerSingleValue(column_name_, field_descriptor_, reader_or_writer_)
        , fixed_string_data_type(fixed_string_data_type_)
        , n(fixed_string_data_type->getN())
    {
        static_assert(is_fixed_string, "This constructor for FixedString only");
        setFunctions();
        prepareEnumMapping();
    }

    /// Constructor for String columns (enforced by static_assert).
    ProtobufSerializerString(
        std::string_view column_name_,
        const google::protobuf::FieldDescriptor & field_descriptor_,
        const ProtobufReaderOrWriter & reader_or_writer_)
        : ProtobufSerializerSingleValue(column_name_, field_descriptor_, reader_or_writer_)
    {
        static_assert(!is_fixed_string, "This constructor for String only");
        setFunctions();
        prepareEnumMapping();
    }

    /// Writes the string at `row_num` with the selected write function.
    void writeRow(size_t row_num) override
    {
        const auto & column_string = assert_cast<const ColumnType &>(*column);
        write_function(std::string_view{column_string.getDataAt(row_num)});
    }

    /// Reads one value. For a new row the bytes are appended directly to the column's chars;
    /// for an existing row they are first read into text_buffer.
    void readRow(size_t row_num) override
    {
        auto & column_string = assert_cast<ColumnType &>(column->assumeMutableRef());
        const size_t old_size = column_string.size();
        typename ColumnType::Chars & data = column_string.getChars();
        const size_t old_data_size = data.size();

        if (row_num < old_size)
        {
            text_buffer.clear();
            read_function(text_buffer);
        }
        else
        {
            try
            {
                read_function(data);
            }
            catch (...)
            {
                /// Roll back whatever was appended before the failure.
                data.resize_assume_reserved(old_data_size);
                throw;
            }
        }

        if constexpr (is_fixed_string)
        {
            if (row_num < old_size)
            {
                /// Overwrite the existing fixed-size slot in place.
                SerializationFixedString::alignStringLength(n, text_buffer, 0);
                memcpy(data.data() + row_num * n, text_buffer.data(), n);
            }
            else
                SerializationFixedString::alignStringLength(n, data, old_data_size);
        }
        else
        {
            if (row_num < old_size)
            {
                /// Only the last row of a ColumnString can be replaced.
                if (row_num != old_size - 1)
                    throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot replace a string in the middle of ColumnString");
                column_string.popBack(1);
                /// NOTE(review): in this path the bytes read into text_buffer above are not appended
                /// to `data` before the terminator is pushed below, so the replaced row appears to
                /// end up empty - confirm whether this is intended.
            }
            try
            {
                data.push_back(0 /* terminating zero */);
                column_string.getOffsets().push_back(data.size());
            }
            catch (...)
            {
                data.resize_assume_reserved(old_data_size);
                column_string.getOffsets().resize_assume_reserved(old_size);
                throw;
            }
        }
    }

    /// Appends the field's default string unless the row already exists.
    void insertDefaults(size_t row_num) override
    {
        auto & column_string = assert_cast<ColumnType &>(column->assumeMutableRef());
        const size_t old_size = column_string.size();
        if (row_num < old_size)
            return;

        const auto & default_str = getDefaultString();
        typename ColumnType::Chars & data = column_string.getChars();
        const size_t old_data_size = data.size();
        try
        {
            data.insert(default_str.data(), default_str.data() + default_str.size());
        }
        catch (...)
        {
            data.resize_assume_reserved(old_data_size);
            throw;
        }

        if constexpr (!is_fixed_string)
        {
            try
            {
                data.push_back(0 /* terminating zero */);
                column_string.getOffsets().push_back(data.size());
            }
            catch (...)
            {
                data.resize_assume_reserved(old_data_size);
                column_string.getOffsets().resize_assume_reserved(old_size);
                throw;
            }
        }
    }

    void describeTree(WriteBuffer & out, size_t indent) const override
    {
        writeIndent(out, indent) << "ProtobufSerializerString<" << (is_fixed_string ? "fixed" : "") << ">: column "
                                 << quoteString(column_name) << " -> field " << quoteString(field_descriptor.full_name()) << " ("
                                 << field_descriptor.type_name() << ")\n";
    }

private:
    /// Fills write_function / read_function / default_function according to field_typeid.
    /// For numeric field types the string is parsed on write and formatted as text on read.
    void setFunctions()
    {
        switch (field_typeid)
        {
            case FieldTypeId::TYPE_INT32:
            {
                write_function = [this](std::string_view str) { writeInt(parseFromStr<Int32>(str)); };
                read_function = [this](PaddedPODArray<UInt8> & str) { toStringAppend(readInt(), str); };
                default_function = [this]() -> String { return toString(field_descriptor.default_value_int32()); };
                break;
            }

            case FieldTypeId::TYPE_SINT32:
            {
                write_function = [this](std::string_view str) { writeSInt(parseFromStr<Int32>(str)); };
                read_function = [this](PaddedPODArray<UInt8> & str) { toStringAppend(readSInt(), str); };
                default_function = [this]() -> String { return toString(field_descriptor.default_value_int32()); };
                break;
            }

            case FieldTypeId::TYPE_UINT32:
            {
                write_function = [this](std::string_view str) { writeUInt(parseFromStr<UInt32>(str)); };
                read_function = [this](PaddedPODArray<UInt8> & str) { toStringAppend(readUInt(), str); };
                default_function = [this]() -> String { return toString(field_descriptor.default_value_uint32()); };
                break;
            }

            case FieldTypeId::TYPE_INT64:
            {
                write_function = [this](std::string_view str) { writeInt(parseFromStr<Int64>(str)); };
                read_function = [this](PaddedPODArray<UInt8> & str) { toStringAppend(readInt(), str); };
                default_function = [this]() -> String { return toString(field_descriptor.default_value_int64()); };
                break;
            }

            case FieldTypeId::TYPE_SINT64:
            {
                write_function = [this](std::string_view str) { writeSInt(parseFromStr<Int64>(str)); };
                read_function = [this](PaddedPODArray<UInt8> & str) { toStringAppend(readSInt(), str); };
                default_function = [this]() -> String { return toString(field_descriptor.default_value_int64()); };
                break;
            }

            case FieldTypeId::TYPE_UINT64:
            {
                write_function = [this](std::string_view str) { writeUInt(parseFromStr<UInt64>(str)); };
                read_function = [this](PaddedPODArray<UInt8> & str) { toStringAppend(readUInt(), str); };
                default_function = [this]() -> String { return toString(field_descriptor.default_value_uint64()); };
                break;
            }

            case FieldTypeId::TYPE_FIXED32:
            {
                write_function = [this](std::string_view str) { writeFixed<UInt32>(parseFromStr<UInt32>(str)); };
                read_function = [this](PaddedPODArray<UInt8> & str) { toStringAppend(readFixed<UInt32>(), str); };
                default_function = [this]() -> String { return toString(field_descriptor.default_value_uint32()); };
                break;
            }

            case FieldTypeId::TYPE_SFIXED32:
            {
                write_function = [this](std::string_view str) { writeFixed<Int32>(parseFromStr<Int32>(str)); };
                read_function = [this](PaddedPODArray<UInt8> & str) { toStringAppend(readFixed<Int32>(), str); };
                default_function = [this]() -> String { return toString(field_descriptor.default_value_int32()); };
                break;
            }

            case FieldTypeId::TYPE_FIXED64:
            {
                write_function = [this](std::string_view str) { writeFixed<UInt64>(parseFromStr<UInt64>(str)); };
                read_function = [this](PaddedPODArray<UInt8> & str) { toStringAppend(readFixed<UInt64>(), str); };
                default_function = [this]() -> String { return toString(field_descriptor.default_value_uint64()); };
                break;
            }

            case FieldTypeId::TYPE_SFIXED64:
            {
                write_function = [this](std::string_view str) { writeFixed<Int64>(parseFromStr<Int64>(str)); };
                read_function = [this](PaddedPODArray<UInt8> & str) { toStringAppend(readFixed<Int64>(), str); };
                default_function = [this]() -> String { return toString(field_descriptor.default_value_int64()); };
                break;
            }

            case FieldTypeId::TYPE_FLOAT:
            {
                write_function = [this](std::string_view str) { writeFixed<Float32>(parseFromStr<Float32>(str)); };
                read_function = [this](PaddedPODArray<UInt8> & str) { toStringAppend(readFixed<Float32>(), str); };
                default_function = [this]() -> String { return toString(field_descriptor.default_value_float()); };
                break;
            }

            case FieldTypeId::TYPE_DOUBLE:
            {
                write_function = [this](std::string_view str) { writeFixed<Float64>(parseFromStr<Float64>(str)); };
                read_function = [this](PaddedPODArray<UInt8> & str) { toStringAppend(readFixed<Float64>(), str); };
                default_function = [this]() -> String { return toString(field_descriptor.default_value_double()); };
                break;
            }

            case FieldTypeId::TYPE_BOOL:
            {
                /// Only the exact strings "true" and "false" are accepted on write.
                write_function = [this](std::string_view str)
                {
                    if (str == "true")
                        writeUInt(1);
                    else if (str == "false")
                        writeUInt(0);
                    else
                        cannotConvertValue(str, "String", field_descriptor.type_name());
                };

                /// Only the wire values 0 and 1 are accepted on read.
                read_function = [this](PaddedPODArray<UInt8> & str)
                {
                    UInt64 u64 = readUInt();
                    if (u64 < 2)
                    {
                        std::string_view ref(u64 ? "true" : "false");
                        str.insert(ref.data(), ref.data() + ref.length());
                    }
                    else
                        cannotConvertValue(toString(u64), field_descriptor.type_name(), "String");
                };

                default_function = [this]() -> String
                {
                    return field_descriptor.default_value_bool() ? "true" : "false";
                };
                break;
            }

            case FieldTypeId::TYPE_STRING:
            case FieldTypeId::TYPE_BYTES:
            {
                write_function = [this](std::string_view str) { writeStr(str); };
                read_function = [this](PaddedPODArray<UInt8> & str) { readStrAndAppend(str); };
                default_function = [this]() -> String { return field_descriptor.default_value_string(); };
                break;
            }

            case FieldTypeId::TYPE_ENUM:
            {
                /// Strings are mapped to/from the names of the protobuf enum values.
                write_function = [this](std::string_view str) { writeInt(stringToProtobufEnumValue(str)); };
                read_function = [this](PaddedPODArray<UInt8> & str) { protobufEnumValueToStringAppend(static_cast<int>(readInt()), str); };
                default_function = [this]() -> String { return field_descriptor.default_value_enum()->name(); };
                break;
            }

            default:
                this->incompatibleColumnType(is_fixed_string ? "FixedString" : "String");
        }
    }

    /// Returns the field's default value as bytes, computed on first use and cached;
    /// for FixedString it is aligned to length `n`.
    const PaddedPODArray<UInt8> & getDefaultString()
    {
        if (!default_string)
        {
            PaddedPODArray<UInt8> arr;
            auto str = default_function();
            arr.insert(str.data(), str.data() + str.size());
            if constexpr (is_fixed_string)
                SerializationFixedString::alignStringLength(n, arr, 0);
            default_string = std::move(arr);
        }
        return *default_string;
    }

    /// Appends the text representation of `value` to `str`.
    template <typename NumberType>
    void toStringAppend(NumberType value, PaddedPODArray<UInt8> & str)
    {
        WriteBufferFromVector buf{str, AppendModeTag{}};
        writeText(value, buf);
    }

    /// Builds the name -> number map of the protobuf enum; only needed when writing to a TYPE_ENUM field.
    void prepareEnumMapping()
    {
        if ((field_typeid == google::protobuf::FieldDescriptor::TYPE_ENUM) && writer)
        {
            const auto & enum_descriptor = *field_descriptor.enum_type();
            for (int i = 0; i != enum_descriptor.value_count(); ++i)
            {
                const auto & enum_value_descriptor = *enum_descriptor.value(i);
                string_to_protobuf_enum_value_map.emplace(enum_value_descriptor.name(), enum_value_descriptor.number());
            }
        }
    }

    /// Looks up the enum number by exact name; throws through cannotConvertValue() if absent.
    int stringToProtobufEnumValue(std::string_view str) const
    {
        auto it = string_to_protobuf_enum_value_map.find(str);
        if (it == string_to_protobuf_enum_value_map.end())
            cannotConvertValue(str, "String", field_descriptor.type_name());
        return it->second;
    }

    /// Looks up the enum value name by number; throws through cannotConvertValue() if absent.
    std::string_view protobufEnumValueToString(int value) const
    {
        const auto * enum_value_descriptor = field_descriptor.enum_type()->FindValueByNumber(value);
        if (!enum_value_descriptor)
            cannotConvertValue(toString(value), field_descriptor.type_name(), "String");
        return enum_value_descriptor->name();
    }

    void protobufEnumValueToStringAppend(int value, PaddedPODArray<UInt8> & str) const
    {
        auto name = protobufEnumValueToString(value);
        str.insert(name.data(), name.data() + name.length());
    }

    const std::shared_ptr<const DataTypeFixedString> fixed_string_data_type;
    const size_t n = 0;
    std::function<void(std::string_view)> write_function;
    std::function<void(PaddedPODArray<UInt8> &)> read_function;
    std::function<String()> default_function;
    /// Keys are views of the enum value names obtained from the field's enum descriptor.
    std::unordered_map<std::string_view, int> string_to_protobuf_enum_value_map;
    PaddedPODArray<UInt8> text_buffer;
    std::optional<PaddedPODArray<UInt8>> default_string;
};


/// Serializes ColumnVector<NumberType> containing enum values to a field of any type
/// except TYPE_MESSAGE, TYPE_GROUP, TYPE_FLOAT, TYPE_DOUBLE, TYPE_BOOL.
/// NumberType can be either Int8 or Int16.
template <typename NumberType>
class ProtobufSerializerEnum : public ProtobufSerializerNumber<NumberType>
{
public:
    using ColumnType = ColumnVector<NumberType>;
    using EnumDataType = DataTypeEnum<NumberType>;
    using BaseClass = ProtobufSerializerNumber<NumberType>;

    ProtobufSerializerEnum(
        std::string_view column_name_,
        const std::shared_ptr<const EnumDataType> & enum_data_type_,
        const FieldDescriptor & field_descriptor_,
        const ProtobufReaderOrWriter & reader_or_writer_)
        : BaseClass(column_name_, field_descriptor_, reader_or_writer_), enum_data_type(enum_data_type_)
    {
        assert(enum_data_type);
        setFunctions();
        prepareEnumMapping();
    }

    void describeTree(WriteBuffer & out, size_t indent) const override
    {
        writeIndent(out, indent) << "ProtobufSerializerEnum<" << TypeName<NumberType> << ">: column " << quoteString(this->column_name)
                                 << " -> field " << quoteString(this->field_descriptor.full_name()) << " ("
                                 << this->field_descriptor.type_name() << ")\n";
    }

private:
    void setFunctions()
    {
        switch (this->field_typeid)
        {
            case FieldTypeId::TYPE_INT32:
            case FieldTypeId::TYPE_SINT32:
            case FieldTypeId::TYPE_UINT32:
            case FieldTypeId::TYPE_INT64:
            case FieldTypeId::TYPE_SINT64:
            case FieldTypeId::TYPE_UINT64:
            case FieldTypeId::TYPE_FIXED32:
            case FieldTypeId::TYPE_SFIXED32:
            case FieldTypeId::TYPE_FIXED64:
            case FieldTypeId::TYPE_SFIXED64:
            {
                auto base_read_function = this->read_function;
                this->read_function = [this, base_read_function]() -> NumberType
                {
NumberType value = base_read_function(); + checkEnumDataTypeValue(value); + return value; + }; + + auto base_default_function = this->default_function; + this->default_function = [this, base_default_function]() -> NumberType + { + auto value = base_default_function(); + checkEnumDataTypeValue(value); + return value; + }; + break; + } + + case FieldTypeId::TYPE_STRING: + case FieldTypeId::TYPE_BYTES: + { + this->write_function = [this](NumberType value) + { + writeStr(enumDataTypeValueToString(value)); + }; + + this->read_function = [this]() -> NumberType + { + readStr(this->text_buffer); + return stringToEnumDataTypeValue(this->text_buffer); + }; + + this->default_function = [this]() -> NumberType + { + return stringToEnumDataTypeValue(this->field_descriptor.default_value_string()); + }; + break; + } + + case FieldTypeId::TYPE_ENUM: + { + this->write_function = [this](NumberType value) { writeInt(enumDataTypeValueToProtobufEnumValue(value)); }; + this->read_function = [this]() -> NumberType { return protobufEnumValueToEnumDataTypeValue(static_cast<NumberType>(readInt())); }; + this->default_function = [this]() -> NumberType { return protobufEnumValueToEnumDataTypeValue(this->field_descriptor.default_value_enum()->number()); }; + break; + } + + default: + this->incompatibleColumnType(enum_data_type->getName()); + } + } + + void checkEnumDataTypeValue(NumberType value) + { + enum_data_type->findByValue(value); /// Throws an exception if the value isn't defined in the DataTypeEnum. 
+ } + + std::string_view enumDataTypeValueToString(NumberType value) const { return std::string_view{enum_data_type->getNameForValue(value)}; } + NumberType stringToEnumDataTypeValue(const String & str) const { return enum_data_type->getValue(str); } + + void prepareEnumMapping() + { + if (this->field_typeid != FieldTypeId::TYPE_ENUM) + return; + + const auto & enum_descriptor = *this->field_descriptor.enum_type(); + + /// We have two mappings: + /// enum_data_type: "string->NumberType" and protobuf_enum: string->int". + /// And here we want to make from those two mapping a new mapping "NumberType->int" (if we're writing protobuf data), + /// or "int->NumberType" (if we're reading protobuf data). + + auto add_to_mapping = [&](NumberType enum_data_type_value, int protobuf_enum_value) + { + if (this->writer) + enum_data_type_value_to_protobuf_enum_value_map.emplace(enum_data_type_value, protobuf_enum_value); + else + protobuf_enum_value_to_enum_data_type_value_map.emplace(protobuf_enum_value, enum_data_type_value); + }; + + auto iless = [](std::string_view s1, std::string_view s2) { return ColumnNameWithProtobufFieldNameComparator::less(s1, s2); }; + boost::container::flat_map<std::string_view, int, decltype(iless)> string_to_protobuf_enum_value_map; + typename decltype(string_to_protobuf_enum_value_map)::sequence_type string_to_protobuf_enum_value_seq; + for (int i : collections::range(enum_descriptor.value_count())) + string_to_protobuf_enum_value_seq.emplace_back(enum_descriptor.value(i)->name(), enum_descriptor.value(i)->number()); + string_to_protobuf_enum_value_map.adopt_sequence(std::move(string_to_protobuf_enum_value_seq)); + + std::vector<NumberType> not_found_by_name_values; + not_found_by_name_values.reserve(enum_data_type->getValues().size()); + + /// Find mapping between enum_data_type and protobuf_enum by name (case insensitively), + /// i.e. 
we add to the mapping + /// NumberType(enum_data_type) -> "NAME"(enum_data_type) -> + /// -> "NAME"(protobuf_enum, same name) -> int(protobuf_enum) + for (const auto & [name, value] : enum_data_type->getValues()) + { + auto it = string_to_protobuf_enum_value_map.find(name); + if (it != string_to_protobuf_enum_value_map.end()) + add_to_mapping(value, it->second); + else + not_found_by_name_values.push_back(value); + } + + if (!not_found_by_name_values.empty()) + { + /// Find mapping between two enum_data_type and protobuf_enum by value. + /// If the same value has different names in enum_data_type and protobuf_enum + /// we can still add it to our mapping, i.e. we add to the mapping + /// NumberType(enum_data_type) -> int(protobuf_enum, same value) + for (NumberType value : not_found_by_name_values) + { + if (enum_descriptor.FindValueByNumber(value)) + add_to_mapping(value, value); + } + } + + size_t num_mapped_values = this->writer ? enum_data_type_value_to_protobuf_enum_value_map.size() + : protobuf_enum_value_to_enum_data_type_value_map.size(); + + if (!num_mapped_values && !enum_data_type->getValues().empty() && enum_descriptor.value_count()) + { + throw Exception(ErrorCodes::DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD, + "Couldn't find mapping between data type {} and the enum {} in the protobuf schema", + enum_data_type->getName(), quoteString(enum_descriptor.full_name())); + } + } + + int enumDataTypeValueToProtobufEnumValue(NumberType value) const + { + auto it = enum_data_type_value_to_protobuf_enum_value_map.find(value); + if (it == enum_data_type_value_to_protobuf_enum_value_map.end()) + cannotConvertValue(toString(value), enum_data_type->getName(), this->field_descriptor.type_name()); + return it->second; + } + + NumberType protobufEnumValueToEnumDataTypeValue(int value) const + { + auto it = protobuf_enum_value_to_enum_data_type_value_map.find(value); + if (it == protobuf_enum_value_to_enum_data_type_value_map.end()) + cannotConvertValue(toString(value), 
this->field_descriptor.type_name(), enum_data_type->getName()); + return it->second; + } + + Int64 readInt() { return ProtobufSerializerSingleValue::readInt(); } + void writeInt(Int64 value) { ProtobufSerializerSingleValue::writeInt(value); } + void writeStr(std::string_view str) { ProtobufSerializerSingleValue::writeStr(str); } + void readStr(String & str) { ProtobufSerializerSingleValue::readStr(str); } + [[noreturn]] void cannotConvertValue(std::string_view src_value, std::string_view src_type_name, std::string_view dest_type_name) const { ProtobufSerializerSingleValue::cannotConvertValue(src_value, src_type_name, dest_type_name); } + + const std::shared_ptr<const EnumDataType> enum_data_type; + std::unordered_map<NumberType, int> enum_data_type_value_to_protobuf_enum_value_map; + std::unordered_map<int, NumberType> protobuf_enum_value_to_enum_data_type_value_map; + }; + + + /// Serializes a ColumnDecimal<DecimalType> to any field except TYPE_MESSAGE, TYPE_GROUP, TYPE_ENUM. + /// DecimalType must be one of the following types: Decimal32, Decimal64, Decimal128, Decimal256, DateTime64. 
+ template <typename DecimalType> + class ProtobufSerializerDecimal : public ProtobufSerializerSingleValue + { + public: + using ColumnType = ColumnDecimal<DecimalType>; + + ProtobufSerializerDecimal( + std::string_view column_name_, + const DataTypeDecimalBase<DecimalType> & decimal_data_type_, + const FieldDescriptor & field_descriptor_, + const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerSingleValue(column_name_, field_descriptor_, reader_or_writer_) + , precision(decimal_data_type_.getPrecision()) + , scale(decimal_data_type_.getScale()) + { + setFunctions(); + } + + void writeRow(size_t row_num) override + { + const auto & column_decimal = assert_cast<const ColumnType &>(*column); + write_function(column_decimal.getElement(row_num)); + } + + void readRow(size_t row_num) override + { + DecimalType decimal = read_function(); + auto & column_decimal = assert_cast<ColumnType &>(column->assumeMutableRef()); + if (row_num < column_decimal.size()) + column_decimal.getElement(row_num) = decimal; + else + column_decimal.insertValue(decimal); + } + + void insertDefaults(size_t row_num) override + { + auto & column_decimal = assert_cast<ColumnType &>(column->assumeMutableRef()); + if (row_num < column_decimal.size()) + return; + column_decimal.insertValue(getDefaultDecimal()); + } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializerDecimal<" << TypeName<DecimalType> << ">: column " << quoteString(column_name) + << " -> field " << quoteString(field_descriptor.full_name()) << " (" << field_descriptor.type_name() + << ")\n"; + } + + private: + void setFunctions() + { + switch (field_typeid) + { + case FieldTypeId::TYPE_INT32: + { + write_function = [this](const DecimalType & decimal) { writeInt(decimalToNumber<Int32>(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readInt()); }; + default_function = [this]() -> DecimalType { return 
numberToDecimal(field_descriptor.default_value_int32()); }; + break; + } + + case FieldTypeId::TYPE_SINT32: + { + write_function = [this](const DecimalType & decimal) { writeSInt(decimalToNumber<Int32>(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readSInt()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_int32()); }; + break; + } + + case FieldTypeId::TYPE_UINT32: + { + write_function = [this](const DecimalType & decimal) { writeUInt(decimalToNumber<UInt32>(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readUInt()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_uint32()); }; + break; + } + + case FieldTypeId::TYPE_INT64: + { + write_function = [this](const DecimalType & decimal) { writeInt(decimalToNumber<Int64>(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readInt()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_int64()); }; + break; + } + + case FieldTypeId::TYPE_SINT64: + { + write_function = [this](const DecimalType & decimal) { writeSInt(decimalToNumber<Int64>(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readSInt()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_int64()); }; + break; + } + + case FieldTypeId::TYPE_UINT64: + { + write_function = [this](const DecimalType & decimal) { writeUInt(decimalToNumber<UInt64>(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readUInt()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_uint64()); }; + break; + } + + case FieldTypeId::TYPE_FIXED32: + { + write_function = [this](const DecimalType & decimal) { writeFixed<UInt32>(decimalToNumber<UInt32>(decimal)); }; + 
read_function = [this]() -> DecimalType { return numberToDecimal(readFixed<UInt32>()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_uint32()); }; + break; + } + + case FieldTypeId::TYPE_SFIXED32: + { + write_function = [this](const DecimalType & decimal) { writeFixed<Int32>(decimalToNumber<Int32>(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readFixed<Int32>()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_int32()); }; + break; + } + + case FieldTypeId::TYPE_FIXED64: + { + write_function = [this](const DecimalType & decimal) { writeFixed<UInt64>(decimalToNumber<UInt64>(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readFixed<UInt64>()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_uint64()); }; + break; + } + + case FieldTypeId::TYPE_SFIXED64: + { + write_function = [this](const DecimalType & decimal) { writeFixed<Int64>(decimalToNumber<Int64>(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readFixed<Int64>()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_int64()); }; + break; + } + + case FieldTypeId::TYPE_FLOAT: + { + write_function = [this](const DecimalType & decimal) { writeFixed<Float32>(decimalToNumber<Float32>(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readFixed<Float32>()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_float()); }; + break; + } + + case FieldTypeId::TYPE_DOUBLE: + { + write_function = [this](const DecimalType & decimal) { writeFixed<Float64>(decimalToNumber<Float64>(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readFixed<Float64>()); }; + default_function = [this]() -> 
DecimalType { return numberToDecimal(field_descriptor.default_value_double()); }; + break; + } + + case FieldTypeId::TYPE_BOOL: + { + if (std::is_same_v<DecimalType, DateTime64>) + incompatibleColumnType(TypeName<DecimalType>); + else + { + write_function = [this](const DecimalType & decimal) + { + if (decimal.value == 0) + writeInt(0); + else if (DecimalComparison<DecimalType, int, EqualsOp>::compare(decimal, 1, scale, 0)) + writeInt(1); + else + { + WriteBufferFromOwnString buf; + writeText(decimal, scale, buf, false); + cannotConvertValue(buf.str(), TypeName<DecimalType>, field_descriptor.type_name()); + } + }; + + read_function = [this]() -> DecimalType + { + UInt64 u64 = readUInt(); + if (u64 < 2) + return numberToDecimal(static_cast<UInt64>(u64 != 0)); + else + cannotConvertValue(toString(u64), field_descriptor.type_name(), TypeName<DecimalType>); + }; + + default_function = [this]() -> DecimalType + { + return numberToDecimal(static_cast<Int64>(field_descriptor.default_value_bool())); + }; + } + break; + } + + case FieldTypeId::TYPE_STRING: + case FieldTypeId::TYPE_BYTES: + { + write_function = [this](const DecimalType & decimal) + { + decimalToString(decimal, text_buffer); + writeStr(text_buffer); + }; + + read_function = [this]() -> DecimalType + { + readStr(text_buffer); + return stringToDecimal(text_buffer); + }; + + default_function = [this]() -> DecimalType { return stringToDecimal(field_descriptor.default_value_string()); }; + break; + } + + default: + incompatibleColumnType(TypeName<DecimalType>); + } + } + + DecimalType getDefaultDecimal() + { + if (!default_decimal) + default_decimal = default_function(); + return *default_decimal; + } + + template <typename NumberType> + DecimalType numberToDecimal(NumberType value) const + { + return convertToDecimal<DataTypeNumber<NumberType>, DataTypeDecimal<DecimalType>>(value, scale); + } + + template <typename NumberType> + NumberType decimalToNumber(const DecimalType & decimal) const + { + return 
DecimalUtils::convertTo<NumberType>(decimal, scale); + } + + void decimalToString(const DecimalType & decimal, String & str) const + { + WriteBufferFromString buf{str}; + if constexpr (std::is_same_v<DecimalType, DateTime64>) + writeDateTimeText(decimal, scale, buf); + else + writeText(decimal, scale, buf, false); + } + + DecimalType stringToDecimal(const String & str) const + { + ReadBufferFromString buf(str); + DecimalType decimal{0}; + if constexpr (std::is_same_v<DecimalType, DateTime64>) + readDateTime64Text(decimal, scale, buf); + else + SerializationDecimal<DecimalType>::readText(decimal, buf, precision, scale); + return decimal; + } + + const UInt32 precision; + const UInt32 scale; + std::function<void(const DecimalType &)> write_function; + std::function<DecimalType()> read_function; + std::function<DecimalType()> default_function; + std::optional<DecimalType> default_decimal; + String text_buffer; + }; + + using ProtobufSerializerDateTime64 = ProtobufSerializerDecimal<DateTime64>; + + + /// Serializes a ColumnVector<UInt16> containing dates to a field of any type except TYPE_MESSAGE, TYPE_GROUP, TYPE_BOOL, TYPE_ENUM. 
+ class ProtobufSerializerDate : public ProtobufSerializerNumber<UInt16> + { + public: + ProtobufSerializerDate( + std::string_view column_name_, + const FieldDescriptor & field_descriptor_, + const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerNumber<UInt16>(column_name_, field_descriptor_, reader_or_writer_) + { + setFunctions(); + } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializerDate: column " << quoteString(column_name) << " -> field " + << quoteString(field_descriptor.full_name()) << " (" << field_descriptor.type_name() << ")\n"; + } + + private: + void setFunctions() + { + switch (field_typeid) + { + case FieldTypeId::TYPE_INT32: + case FieldTypeId::TYPE_SINT32: + case FieldTypeId::TYPE_UINT32: + case FieldTypeId::TYPE_INT64: + case FieldTypeId::TYPE_SINT64: + case FieldTypeId::TYPE_UINT64: + case FieldTypeId::TYPE_FIXED32: + case FieldTypeId::TYPE_SFIXED32: + case FieldTypeId::TYPE_FIXED64: + case FieldTypeId::TYPE_SFIXED64: + case FieldTypeId::TYPE_FLOAT: + case FieldTypeId::TYPE_DOUBLE: + break; /// already set in ProtobufSerializerNumber<UInt16>::setFunctions(). 
+ + case FieldTypeId::TYPE_STRING: + case FieldTypeId::TYPE_BYTES: + { + write_function = [this](UInt16 value) + { + dateToString(static_cast<DayNum>(value), text_buffer); + writeStr(text_buffer); + }; + + read_function = [this]() -> UInt16 + { + readStr(text_buffer); + return stringToDate(text_buffer); + }; + + default_function = [this]() -> UInt16 { return stringToDate(field_descriptor.default_value_string()); }; + break; + } + + default: + incompatibleColumnType("Date"); + } + } + + static void dateToString(DayNum date, String & str) + { + WriteBufferFromString buf{str}; + writeText(date, buf); + } + + static DayNum stringToDate(const String & str) + { + DayNum date; + ReadBufferFromString buf{str}; + readDateText(date, buf); + return date; + } + }; + + class ProtobufSerializerDate32 : public ProtobufSerializerNumber<Int32> + { + public: + ProtobufSerializerDate32( + std::string_view column_name_, + const FieldDescriptor & field_descriptor_, + const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerNumber<Int32>(column_name_, field_descriptor_, reader_or_writer_) + { + setFunctions(); + } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializerDate32: column " << quoteString(column_name) << " -> field " + << quoteString(field_descriptor.full_name()) << " (" << field_descriptor.type_name() << ")\n"; + } + + private: + void setFunctions() + { + switch (field_typeid) + { + case FieldTypeId::TYPE_INT32: + case FieldTypeId::TYPE_SINT32: + case FieldTypeId::TYPE_UINT32: + case FieldTypeId::TYPE_INT64: + case FieldTypeId::TYPE_SINT64: + case FieldTypeId::TYPE_UINT64: + case FieldTypeId::TYPE_FIXED32: + case FieldTypeId::TYPE_SFIXED32: + case FieldTypeId::TYPE_FIXED64: + case FieldTypeId::TYPE_SFIXED64: + case FieldTypeId::TYPE_FLOAT: + case FieldTypeId::TYPE_DOUBLE: + break; /// already set in ProtobufSerializerNumber<Int32>::setFunctions(). 
+ + case FieldTypeId::TYPE_STRING: + case FieldTypeId::TYPE_BYTES: + { + write_function = [this](Int32 value) + { + dateToString(static_cast<ExtendedDayNum>(value), text_buffer); + writeStr(text_buffer); + }; + + read_function = [this]() -> Int32 + { + readStr(text_buffer); + return stringToDate(text_buffer); + }; + + default_function = [this]() -> Int32 { return stringToDate(field_descriptor.default_value_string()); }; + break; + } + + default: + incompatibleColumnType("Date32"); + } + } + + static void dateToString(ExtendedDayNum date, String & str) + { + WriteBufferFromString buf{str}; + writeDateText(date, buf); + } + + static ExtendedDayNum stringToDate(const String & str) + { + ExtendedDayNum date; + ReadBufferFromString buf{str}; + readDateText(date, buf); + return date; + } + }; + + class ProtobufSerializerIPv4 : public ProtobufSerializerNumber<IPv4> + { + public: + ProtobufSerializerIPv4( + std::string_view column_name_, + const FieldDescriptor & field_descriptor_, + const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerNumber<IPv4>(column_name_, field_descriptor_, reader_or_writer_) + { + setFunctions(); + } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializerDate: column " << quoteString(column_name) << " -> field " + << quoteString(field_descriptor.full_name()) << " (" << field_descriptor.type_name() << ")\n"; + } + + private: + void setFunctions() + { + switch (field_typeid) + { + case FieldTypeId::TYPE_INT32: + case FieldTypeId::TYPE_SINT32: + case FieldTypeId::TYPE_UINT32: + case FieldTypeId::TYPE_INT64: + case FieldTypeId::TYPE_SINT64: + case FieldTypeId::TYPE_UINT64: + case FieldTypeId::TYPE_FIXED32: + case FieldTypeId::TYPE_SFIXED32: + case FieldTypeId::TYPE_FIXED64: + case FieldTypeId::TYPE_SFIXED64: + case FieldTypeId::TYPE_FLOAT: + case FieldTypeId::TYPE_DOUBLE: + break; /// already set in ProtobufSerializerNumber<IPv4>::setFunctions(). 
+ + case FieldTypeId::TYPE_STRING: + case FieldTypeId::TYPE_BYTES: + { + write_function = [this](IPv4 value) + { + ipv4ToString(value, text_buffer); + writeStr(text_buffer); + }; + + read_function = [this]() -> IPv4 + { + readStr(text_buffer); + return stringToIPv4(text_buffer); + }; + + default_function = [this]() -> IPv4 { return stringToIPv4(field_descriptor.default_value_string()); }; + break; + } + + default: + incompatibleColumnType("IPv4"); + } + } + + static void ipv4ToString(IPv4 value, String & str) + { + WriteBufferFromString buf{str}; + writeIPv4Text(value, buf); + } + + static IPv4 stringToIPv4(const String & str) + { + IPv4 value; + ReadBufferFromString buf{str}; + readIPv4Text(value, buf); + return value; + } + }; + + /// Serializes a ColumnVector<UInt32> containing datetimes to a field of any type except TYPE_MESSAGE, TYPE_GROUP, TYPE_BOOL, TYPE_ENUM. + class ProtobufSerializerDateTime : public ProtobufSerializerNumber<UInt32> + { + public: + ProtobufSerializerDateTime( + std::string_view column_name_, + const DataTypeDateTime & type, + const FieldDescriptor & field_descriptor_, + const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerNumber<UInt32>(column_name_, field_descriptor_, reader_or_writer_), + date_lut(type.getTimeZone()) + { + setFunctions(); + } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializerDateTime: column " << quoteString(column_name) << " -> field " + << quoteString(field_descriptor.full_name()) << " (" << field_descriptor.type_name() << ")\n"; + } + + protected: + const DateLUTImpl & date_lut; + + void setFunctions() + { + switch (field_typeid) + { + case FieldTypeId::TYPE_INT32: + case FieldTypeId::TYPE_SINT32: + case FieldTypeId::TYPE_UINT32: + case FieldTypeId::TYPE_INT64: + case FieldTypeId::TYPE_SINT64: + case FieldTypeId::TYPE_UINT64: + case FieldTypeId::TYPE_FIXED32: + case FieldTypeId::TYPE_SFIXED32: + case 
FieldTypeId::TYPE_FIXED64: + case FieldTypeId::TYPE_SFIXED64: + case FieldTypeId::TYPE_FLOAT: + case FieldTypeId::TYPE_DOUBLE: + break; /// already set in ProtobufSerializerNumber<UInt32>::setFunctions(). + + case FieldTypeId::TYPE_STRING: + case FieldTypeId::TYPE_BYTES: + { + write_function = [this](UInt32 value) + { + dateTimeToString(value, text_buffer, date_lut); + writeStr(text_buffer); + }; + + read_function = [this]() -> UInt32 + { + readStr(text_buffer); + return static_cast<UInt32>(stringToDateTime(text_buffer, date_lut)); + }; + + default_function = [this]() -> UInt32 + { + return static_cast<UInt32>(stringToDateTime(field_descriptor.default_value_string(), date_lut)); + }; + break; + } + + default: + incompatibleColumnType("DateTime"); + } + } + + static void dateTimeToString(time_t tm, String & str, const DateLUTImpl & lut) + { + WriteBufferFromString buf{str}; + writeDateTimeText(tm, buf, lut); + } + + static time_t stringToDateTime(const String & str, const DateLUTImpl & lut) + { + ReadBufferFromString buf{str}; + time_t tm = 0; + readDateTimeText(tm, buf, lut); + if (tm < 0) + tm = 0; + return tm; + } + }; + + + /// Serializes a ColumnVector<UUID> containing UUIDs to a field of type TYPE_STRING or TYPE_BYTES. 
+ class ProtobufSerializerUUID : public ProtobufSerializerSingleValue + { + public: + ProtobufSerializerUUID( + std::string_view column_name_, + const google::protobuf::FieldDescriptor & field_descriptor_, + const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerSingleValue(column_name_, field_descriptor_, reader_or_writer_) + { + setFunctions(); + } + + void writeRow(size_t row_num) override + { + const auto & column_vector = assert_cast<const ColumnVector<UUID> &>(*column); + write_function(column_vector.getElement(row_num)); + } + + void readRow(size_t row_num) override + { + UUID value = read_function(); + auto & column_vector = assert_cast<ColumnVector<UUID> &>(column->assumeMutableRef()); + if (row_num < column_vector.size()) + column_vector.getElement(row_num) = value; + else + column_vector.insertValue(value); + } + + void insertDefaults(size_t row_num) override + { + auto & column_vector = assert_cast<ColumnVector<UUID> &>(column->assumeMutableRef()); + if (row_num < column_vector.size()) + return; + column_vector.insertDefault(); + } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializerUUID: column " << quoteString(column_name) << " -> field " + << quoteString(field_descriptor.full_name()) << " (" << field_descriptor.type_name() << ")\n"; + } + + private: + void setFunctions() + { + if ((field_typeid != FieldTypeId::TYPE_STRING) && (field_typeid != FieldTypeId::TYPE_BYTES)) + incompatibleColumnType("UUID"); + + write_function = [this](UUID value) + { + uuidToString(value, text_buffer); + writeStr(text_buffer); + }; + + read_function = [this]() -> UUID + { + readStr(text_buffer); + return parse<UUID>(text_buffer); + }; + + default_function = [this]() -> UUID { return parse<UUID>(field_descriptor.default_value_string()); }; + } + + static void uuidToString(const UUID & uuid, String & str) + { + WriteBufferFromString buf{str}; + writeText(uuid, buf); + } + + 
std::function<void(UUID)> write_function; + std::function<UUID()> read_function; + std::function<UUID()> default_function; + String text_buffer; + }; + + /// Serializes a ColumnVector<IPv6> containing IPv6s to a field of type TYPE_STRING or TYPE_BYTES. + class ProtobufSerializerIPv6 : public ProtobufSerializerSingleValue + { + public: + ProtobufSerializerIPv6( + std::string_view column_name_, + const google::protobuf::FieldDescriptor & field_descriptor_, + const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerSingleValue(column_name_, field_descriptor_, reader_or_writer_) + { + setFunctions(); + } + + void writeRow(size_t row_num) override + { + const auto & column_vector = assert_cast<const ColumnVector<IPv6> &>(*column); + write_function(column_vector.getElement(row_num)); + } + + void readRow(size_t row_num) override + { + IPv6 value = read_function(); + auto & column_vector = assert_cast<ColumnVector<IPv6> &>(column->assumeMutableRef()); + if (row_num < column_vector.size()) + column_vector.getElement(row_num) = value; + else + column_vector.insertValue(value); + } + + void insertDefaults(size_t row_num) override + { + auto & column_vector = assert_cast<ColumnVector<IPv6> &>(column->assumeMutableRef()); + if (row_num < column_vector.size()) + return; + column_vector.insertDefault(); + } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializer" << TypeName<IPv6> << ": column " << quoteString(column_name) << " -> field " + << quoteString(field_descriptor.full_name()) << " (" << field_descriptor.type_name() << ")\n"; + } + + private: + void setFunctions() + { + if ((field_typeid != FieldTypeId::TYPE_STRING) && (field_typeid != FieldTypeId::TYPE_BYTES)) + incompatibleColumnType(TypeName<IPv6>); + + write_function = [this](IPv6 value) + { + text_buffer = String(IPV6_BINARY_LENGTH, '\0'); + memcpy(text_buffer.data(), &value.toUnderType(), IPV6_BINARY_LENGTH); + 
writeStr(text_buffer); + }; + + read_function = [this]() -> IPv6 + { + readStr(text_buffer); + if (text_buffer.size() != IPV6_BINARY_LENGTH) + throw Exception(ErrorCodes::PROTOBUF_BAD_CAST, + "Could not convert bytes field {} to IPv6 for inserting into column {} - field size {} is not equal to IPv6 size {}", + field_descriptor.full_name(), column_name, text_buffer.size(), IPV6_BINARY_LENGTH); + IPv6 value; + memcpy(&value.toUnderType(), text_buffer.data(), IPV6_BINARY_LENGTH); + return value; + }; + + default_function = [this]() -> IPv6 { return parse<IPv6>(field_descriptor.default_value_string()); }; + } + + std::function<void(IPv6)> write_function; + std::function<IPv6()> read_function; + std::function<IPv6()> default_function; + String text_buffer; + }; + + using ProtobufSerializerInterval = ProtobufSerializerNumber<Int64>; + + + /// Serializes a ColumnAggregateFunction to a field of type TYPE_STRING or TYPE_BYTES. + class ProtobufSerializerAggregateFunction : public ProtobufSerializerSingleValue + { + public: + ProtobufSerializerAggregateFunction( + std::string_view column_name_, + const std::shared_ptr<const DataTypeAggregateFunction> & aggregate_function_data_type_, + const google::protobuf::FieldDescriptor & field_descriptor_, + const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerSingleValue(column_name_, field_descriptor_, reader_or_writer_) + , aggregate_function_data_type(aggregate_function_data_type_) + , aggregate_function(aggregate_function_data_type->getFunction()) + { + if ((field_typeid != FieldTypeId::TYPE_STRING) && (field_typeid != FieldTypeId::TYPE_BYTES)) + incompatibleColumnType(aggregate_function_data_type->getName()); + } + + void writeRow(size_t row_num) override + { + const auto & column_af = assert_cast<const ColumnAggregateFunction &>(*column); + dataToString(column_af.getData()[row_num], text_buffer); + writeStr(text_buffer); + } + + void readRow(size_t row_num) override + { + auto & column_af = 
assert_cast<ColumnAggregateFunction &>(column->assumeMutableRef()); + Arena & arena = column_af.createOrGetArena(); + AggregateDataPtr data; + readStr(text_buffer); + data = stringToData(text_buffer, arena); + + if (row_num < column_af.size()) + { + auto * old_data = std::exchange(column_af.getData()[row_num], data); + aggregate_function->destroy(old_data); + } + else + column_af.getData().push_back(data); + } + + void insertDefaults(size_t row_num) override + { + auto & column_af = assert_cast<ColumnAggregateFunction &>(column->assumeMutableRef()); + if (row_num < column_af.size()) + return; + + Arena & arena = column_af.createOrGetArena(); + AggregateDataPtr data = stringToData(field_descriptor.default_value_string(), arena); + column_af.getData().push_back(data); + } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializerAggregateFunction: column " << quoteString(column_name) << " -> field " + << quoteString(field_descriptor.full_name()) << " (" << field_descriptor.type_name() << ")\n"; + } + + private: + void dataToString(ConstAggregateDataPtr data, String & str) const + { + WriteBufferFromString buf{str}; + aggregate_function->serialize(data, buf); + } + + AggregateDataPtr stringToData(const String & str, Arena & arena) const + { + size_t size_of_state = aggregate_function->sizeOfData(); + AggregateDataPtr data = arena.alignedAlloc(size_of_state, aggregate_function->alignOfData()); + try + { + aggregate_function->create(data); + ReadBufferFromMemory buf(str.data(), str.length()); + aggregate_function->deserialize(data, buf, std::nullopt, &arena); + return data; + } + catch (...) + { + aggregate_function->destroy(data); + throw; + } + } + + const std::shared_ptr<const DataTypeAggregateFunction> aggregate_function_data_type; + AggregateFunctionPtr aggregate_function; + String text_buffer; + }; + + + /// Serializes a ColumnNullable. 
+ class ProtobufSerializerNullable : public ProtobufSerializer + { + public: + explicit ProtobufSerializerNullable(std::unique_ptr<ProtobufSerializer> nested_serializer_) + : nested_serializer(std::move(nested_serializer_)) + { + } + + void setColumns(const ColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + if (num_columns != 1) + wrongNumberOfColumns(num_columns, "1"); + column = columns[0]; + const auto & column_nullable = assert_cast<const ColumnNullable &>(*column); + ColumnPtr nested_column = column_nullable.getNestedColumnPtr(); + nested_serializer->setColumns(&nested_column, 1); + } + + void setColumns(const MutableColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + if (num_columns != 1) + wrongNumberOfColumns(num_columns, "1"); + ColumnPtr column0 = columns[0]->getPtr(); + setColumns(&column0, 1); + } + + void writeRow(size_t row_num) override + { + const auto & column_nullable = assert_cast<const ColumnNullable &>(*column); + const auto & null_map = column_nullable.getNullMapData(); + if (!null_map[row_num]) + nested_serializer->writeRow(row_num); + } + + void readRow(size_t row_num) override + { + auto & column_nullable = assert_cast<ColumnNullable &>(column->assumeMutableRef()); + auto & nested_column = column_nullable.getNestedColumn(); + auto & null_map = column_nullable.getNullMapData(); + size_t old_size = null_map.size(); + + nested_serializer->readRow(row_num); + + if (row_num < old_size) + { + null_map[row_num] = false; + } + else + { + size_t new_size = nested_column.size(); + if (new_size != old_size + 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of ColumnNullable is unexpected"); + try + { + null_map.push_back(false); + } + catch (...) 
+ { + nested_column.popBack(1); + throw; + } + } + } + + void insertDefaults(size_t row_num) override + { + auto & column_nullable = assert_cast<ColumnNullable &>(column->assumeMutableRef()); + if (row_num < column_nullable.size()) + return; + column_nullable.insertDefault(); + } + + void insertNestedDefaults(size_t row_num) + { + auto & column_nullable = assert_cast<ColumnNullable &>(column->assumeMutableRef()); + if (row_num < column_nullable.size()) + return; + column_nullable.getNestedColumn().insertDefault(); + column_nullable.getNullMapData().push_back(0); + } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializerNullable ->\n"; + nested_serializer->describeTree(out, indent + 1); + } + + private: + const std::unique_ptr<ProtobufSerializer> nested_serializer; + ColumnPtr column; + }; + + + /// Serializes a ColumnMap. + class ProtobufSerializerMap : public ProtobufSerializer + { + public: + explicit ProtobufSerializerMap(std::unique_ptr<ProtobufSerializer> nested_serializer_) + : nested_serializer(std::move(nested_serializer_)) + { + } + + void setColumns(const ColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + if (num_columns != 1) + wrongNumberOfColumns(num_columns, "1"); + const auto & column_map = assert_cast<const ColumnMap &>(*columns[0]); + ColumnPtr nested_column = column_map.getNestedColumnPtr(); + nested_serializer->setColumns(&nested_column, 1); + } + + void setColumns(const MutableColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + if (num_columns != 1) + wrongNumberOfColumns(num_columns, "1"); + ColumnPtr column0 = columns[0]->getPtr(); + setColumns(&column0, 1); + } + + void writeRow(size_t row_num) override { nested_serializer->writeRow(row_num); } + void readRow(size_t row_num) override { nested_serializer->readRow(row_num); } + void insertDefaults(size_t row_num) override { nested_serializer->insertDefaults(row_num); } + + void 
describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializerMap ->\n"; + nested_serializer->describeTree(out, indent + 1); + } + + private: + const std::unique_ptr<ProtobufSerializer> nested_serializer; + }; + + + /// Serializes a ColumnLowCardinality. + class ProtobufSerializerLowCardinality : public ProtobufSerializer + { + public: + explicit ProtobufSerializerLowCardinality(std::unique_ptr<ProtobufSerializer> nested_serializer_) + : nested_serializer(std::move(nested_serializer_)) + { + } + + void setColumns(const ColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + if (num_columns != 1) + wrongNumberOfColumns(num_columns, "1"); + column = columns[0]; + const auto & column_lc = assert_cast<const ColumnLowCardinality &>(*column); + ColumnPtr nested_column = column_lc.getDictionary().getNestedColumn(); + nested_serializer->setColumns(&nested_column, 1); + read_value_column_set = false; + } + + void setColumns(const MutableColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + if (num_columns != 1) + wrongNumberOfColumns(num_columns, "1"); + ColumnPtr column0 = columns[0]->getPtr(); + setColumns(&column0, 1); + } + + void writeRow(size_t row_num) override + { + const auto & column_lc = assert_cast<const ColumnLowCardinality &>(*column); + size_t unique_row_number = column_lc.getIndexes().getUInt(row_num); + nested_serializer->writeRow(unique_row_number); + } + + void readRow(size_t row_num) override + { + auto & column_lc = assert_cast<ColumnLowCardinality &>(column->assumeMutableRef()); + + if (!read_value_column_set) + { + if (!read_value_column) + { + ColumnPtr nested_column = column_lc.getDictionary().getNestedColumn(); + read_value_column = nested_column->cloneEmpty(); + } + nested_serializer->setColumns(&read_value_column, 1); + read_value_column_set = true; + } + + read_value_column->popBack(read_value_column->size()); + nested_serializer->readRow(0); + + if (row_num 
< column_lc.size()) + { + if (row_num != column_lc.size() - 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot replace an element in the middle of ColumnLowCardinality"); + column_lc.popBack(1); + } + + column_lc.insertFromFullColumn(*read_value_column, 0); + } + + void insertDefaults(size_t row_num) override + { + auto & column_lc = assert_cast<ColumnLowCardinality &>(column->assumeMutableRef()); + if (row_num < column_lc.size()) + return; + + if (!default_value_column) + { + ColumnPtr nested_column = column_lc.getDictionary().getNestedColumn(); + default_value_column = nested_column->cloneEmpty(); + nested_serializer->setColumns(&default_value_column, 1); + nested_serializer->insertDefaults(0); + read_value_column_set = false; + } + + column_lc.insertFromFullColumn(*default_value_column, 0); + } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializerLowCardinality ->\n"; + nested_serializer->describeTree(out, indent + 1); + } + + private: + const std::unique_ptr<ProtobufSerializer> nested_serializer; + ColumnPtr column; + MutableColumnPtr read_value_column; + bool read_value_column_set = false; + MutableColumnPtr default_value_column; + }; + + + /// Serializes a ColumnArray to a repeated field. 
+ class ProtobufSerializerArray : public ProtobufSerializer + { + public: + explicit ProtobufSerializerArray(std::unique_ptr<ProtobufSerializer> element_serializer_) + : element_serializer(std::move(element_serializer_)) + { + } + + void setColumns(const ColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + if (num_columns != 1) + wrongNumberOfColumns(num_columns, "1"); + column = columns[0]; + const auto & column_array = assert_cast<const ColumnArray &>(*column); + ColumnPtr data_column = column_array.getDataPtr(); + element_serializer->setColumns(&data_column, 1); + } + + void setColumns(const MutableColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + if (num_columns != 1) + wrongNumberOfColumns(num_columns, "1"); + ColumnPtr column0 = columns[0]->getPtr(); + setColumns(&column0, 1); + } + + void writeRow(size_t row_num) override + { + const auto & column_array = assert_cast<const ColumnArray &>(*column); + const auto & offsets = column_array.getOffsets(); + for (size_t i : collections::range(offsets[row_num - 1], offsets[row_num])) + element_serializer->writeRow(i); + } + + void readRow(size_t row_num) override + { + auto & column_array = assert_cast<ColumnArray &>(column->assumeMutableRef()); + auto & offsets = column_array.getOffsets(); + size_t old_size = offsets.size(); + if (row_num + 1 < old_size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot replace an element in the middle of ColumnArray"); + auto data_column = column_array.getDataPtr(); + size_t old_data_size = data_column->size(); + + try + { + element_serializer->readRow(old_data_size); + size_t data_size = data_column->size(); + if (data_size != old_data_size + 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of ColumnArray is unexpected"); + + if (row_num < old_size) + offsets.back() = data_size; + else + offsets.push_back(data_size); + } + catch (...) 
+ { + if (data_column->size() > old_data_size) + data_column->assumeMutableRef().popBack(data_column->size() - old_data_size); + if (offsets.size() > old_size) + column_array.getOffsetsColumn().popBack(offsets.size() - old_size); + throw; + } + } + + void insertDefaults(size_t row_num) override + { + auto & column_array = assert_cast<ColumnArray &>(column->assumeMutableRef()); + if (row_num < column_array.size()) + return; + column_array.insertDefault(); + } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializerArray ->\n"; + element_serializer->describeTree(out, indent + 1); + } + + private: + const std::unique_ptr<ProtobufSerializer> element_serializer; + ColumnPtr column; + }; + + + /// Serializes a ColumnTuple as a repeated field (just like we serialize arrays). + class ProtobufSerializerTupleAsArray : public ProtobufSerializer + { + public: + ProtobufSerializerTupleAsArray( + std::string_view column_name_, + const std::shared_ptr<const DataTypeTuple> & tuple_data_type_, + const FieldDescriptor & field_descriptor_, + std::vector<std::unique_ptr<ProtobufSerializer>> element_serializers_) + : column_name(column_name_) + , tuple_data_type(tuple_data_type_) + , tuple_size(tuple_data_type->getElements().size()) + , field_descriptor(field_descriptor_) + , element_serializers(std::move(element_serializers_)) + { + assert(tuple_size); + assert(tuple_size == element_serializers.size()); + } + + void setColumns(const ColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + if (num_columns != 1) + wrongNumberOfColumns(num_columns, "1"); + column = columns[0]; + const auto & column_tuple = assert_cast<const ColumnTuple &>(*column); + for (size_t i : collections::range(tuple_size)) + { + auto element_column = column_tuple.getColumnPtr(i); + element_serializers[i]->setColumns(&element_column, 1); + } + current_element_index = 0; + } + + void setColumns(const MutableColumnPtr * columns, 
[[maybe_unused]] size_t num_columns) override + { + if (num_columns != 1) + wrongNumberOfColumns(num_columns, "1"); + ColumnPtr column0 = columns[0]->getPtr(); + setColumns(&column0, 1); + } + + void writeRow(size_t row_num) override + { + for (size_t i : collections::range(tuple_size)) + element_serializers[i]->writeRow(row_num); + } + + void readRow(size_t row_num) override + { + auto & column_tuple = assert_cast<ColumnTuple &>(column->assumeMutableRef()); + + size_t old_size = column_tuple.size(); + if (row_num >= old_size) + current_element_index = 0; + + insertDefaults(row_num); + + if (current_element_index >= tuple_size) + { + throw Exception( + ErrorCodes::PROTOBUF_BAD_CAST, + "Column {}: More than {} elements was read from the field {} to fit in the data type {}", + quoteString(column_name), + tuple_size, + quoteString(field_descriptor.full_name()), + tuple_data_type->getName()); + } + + element_serializers[current_element_index]->readRow(row_num); + ++current_element_index; + } + + void insertDefaults(size_t row_num) override + { + auto & column_tuple = assert_cast<ColumnTuple &>(column->assumeMutableRef()); + size_t old_size = column_tuple.size(); + + if (row_num > old_size) + return; + + try + { + for (size_t i : collections::range(tuple_size)) + element_serializers[i]->insertDefaults(row_num); + } + catch (...) 
+ { + for (size_t i : collections::range(tuple_size)) + { + auto element_column = column_tuple.getColumnPtr(i)->assumeMutable(); + if (element_column->size() > old_size) + element_column->popBack(element_column->size() - old_size); + } + throw; + } + } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializerTupleAsArray: column " << quoteString(column_name) << " (" + << tuple_data_type->getName() << ") -> field " << quoteString(field_descriptor.full_name()) << " (" + << field_descriptor.type_name() << ") ->\n"; + for (const auto & element_serializer : element_serializers) + element_serializer->describeTree(out, indent + 1); + } + + private: + const String column_name; + const std::shared_ptr<const DataTypeTuple> tuple_data_type; + const size_t tuple_size; + const FieldDescriptor & field_descriptor; + const std::vector<std::unique_ptr<ProtobufSerializer>> element_serializers; + ColumnPtr column; + size_t current_element_index = 0; + }; + + + /// Serializes a message (root or nested) in the protobuf schema. + class ProtobufSerializerMessage : public ProtobufSerializer + { + public: + struct FieldDesc + { + std::vector<size_t> column_indices; + const FieldDescriptor * field_descriptor; + std::unique_ptr<ProtobufSerializer> field_serializer; + }; + + ProtobufSerializerMessage( + std::vector<FieldDesc> && field_descs_, + const FieldDescriptor * parent_field_descriptor_, + bool with_length_delimiter_, + bool google_wrappers_special_treatment_, + std::unique_ptr<RowInputMissingColumnsFiller> missing_columns_filler_, + const ProtobufReaderOrWriter & reader_or_writer_) + : parent_field_descriptor(parent_field_descriptor_) + , with_length_delimiter(with_length_delimiter_) + , google_wrappers_special_treatment(google_wrappers_special_treatment_) + , missing_columns_filler(std::move(missing_columns_filler_)) + , should_skip_if_empty(parent_field_descriptor + ? 
shouldSkipZeroOrEmpty(*parent_field_descriptor, google_wrappers_special_treatment_) : false) + , reader(reader_or_writer_.reader) + , writer(reader_or_writer_.writer) + { + field_infos.reserve(field_descs_.size()); + for (auto & desc : field_descs_) + field_infos.emplace_back(std::move(desc.column_indices), *desc.field_descriptor, std::move(desc.field_serializer)); + + ::sort(field_infos.begin(), field_infos.end(), + [](const FieldInfo & lhs, const FieldInfo & rhs) { return lhs.field_tag < rhs.field_tag; }); + + for (size_t i : collections::range(field_infos.size())) + field_index_by_field_tag.emplace(field_infos[i].field_tag, i); + } + + void setHasEnvelopeAsParent() + { + has_envelope_as_parent = true; + } + + void setColumns(const ColumnPtr * columns_, size_t num_columns_) override + { + if (!num_columns_) + wrongNumberOfColumns(num_columns_, ">0"); + + std::vector<ColumnPtr> field_columns; + for (const FieldInfo & info : field_infos) + { + field_columns.clear(); + field_columns.reserve(info.column_indices.size()); + for (size_t column_index : info.column_indices) + { + if (column_index >= num_columns_) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong column index {}, expected column indices <{}", column_index, num_columns_); + field_columns.emplace_back(columns_[column_index]); + } + info.field_serializer->setColumns(field_columns.data(), field_columns.size()); + } + + if (reader || (google_wrappers_special_treatment && isGoogleWrapperField(parent_field_descriptor))) + { + mutable_columns.resize(num_columns_); + for (size_t i : collections::range(num_columns_)) + mutable_columns[i] = columns_[i]->assumeMutable(); + + std::vector<UInt8> column_is_missing; + column_is_missing.resize(num_columns_, true); + for (const FieldInfo & info : field_infos) + for (size_t i : info.column_indices) + column_is_missing[i] = false; + + has_missing_columns = (std::find(column_is_missing.begin(), column_is_missing.end(), true) != column_is_missing.end()); + } + } + + void 
setColumns(const MutableColumnPtr * columns_, size_t num_columns_) override + { + Columns cols; + cols.reserve(num_columns_); + for (size_t i : collections::range(num_columns_)) + cols.push_back(columns_[i]->getPtr()); + setColumns(cols.data(), cols.size()); + } + + void writeRow(size_t row_num) override + { + if (parent_field_descriptor || has_envelope_as_parent) + writer->startNestedMessage(); + else + writer->startMessage(); + + for (const FieldInfo & info : field_infos) + { + if (info.should_pack_repeated) + writer->startRepeatedPack(); + info.field_serializer->writeRow(row_num); + if (info.should_pack_repeated) + writer->endRepeatedPack(info.field_tag, true); + } + + if (parent_field_descriptor) + { + bool is_group = (parent_field_descriptor->type() == FieldTypeId::TYPE_GROUP); + writer->endNestedMessage(parent_field_descriptor->number(), is_group, + should_skip_if_empty || (google_wrappers_special_treatment && isNullGoogleWrapper(row_num))); + } + else if (has_envelope_as_parent) + { + writer->endNestedMessage(1, false, should_skip_if_empty); + } + else + writer->endMessage(with_length_delimiter); + } + + void readRow(size_t row_num) override + { + if (parent_field_descriptor || has_envelope_as_parent) + reader->startNestedMessage(); + else + reader->startMessage(with_length_delimiter); + + if (!field_infos.empty()) + { + last_field_index = 0; + last_field_tag = field_infos[0].field_tag; + size_t old_size = mutable_columns.empty() ? 
0 : mutable_columns[0]->size(); + + try + { + int field_tag; + while (reader->readFieldNumber(field_tag)) + { + size_t field_index = findFieldIndexByFieldTag(field_tag); + if (field_index == static_cast<size_t>(-1)) + continue; + auto * field_serializer = field_infos[field_index].field_serializer.get(); + field_serializer->readRow(row_num); + field_infos[field_index].field_read = true; + } + + for (auto & info : field_infos) + { + if (info.field_read) + info.field_read = false; + else + { + if (google_wrappers_special_treatment && isNullableGoogleWrapper()) + { + auto * nullable_ser = dynamic_cast<ProtobufSerializerNullable*>(info.field_serializer.get()); + nullable_ser->insertNestedDefaults(row_num); + } + else + { + info.field_serializer->insertDefaults(row_num); + } + } + } + } + catch (...) + { + for (auto & column : mutable_columns) + { + if (column->size() > old_size) + column->popBack(column->size() - old_size); + } + throw; + } + } + + if (parent_field_descriptor || has_envelope_as_parent) + reader->endNestedMessage(); + else + reader->endMessage(false); + + addDefaultsToMissingColumns(row_num); + } + + void insertDefaults(size_t row_num) override + { + for (const FieldInfo & info : field_infos) + info.field_serializer->insertDefaults(row_num); + addDefaultsToMissingColumns(row_num); + } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + size_t num_columns = 0; + for (const auto & field_info : field_infos) + num_columns += field_info.column_indices.size(); + + writeIndent(out, indent) << "ProtobufSerializerMessage: " << num_columns << " columns ->"; + if (parent_field_descriptor) + out << " field " << quoteString(parent_field_descriptor->full_name()) << " (" << parent_field_descriptor->type_name() << ")"; + + for (const auto & field_info : field_infos) + { + out << "\n"; + writeIndent(out, indent + 1) << "Columns #"; + for (size_t j = 0; j != field_info.column_indices.size(); ++j) + { + if (j) + out << ", "; + out << 
field_info.column_indices[j]; + } + out << " ->\n"; + field_info.field_serializer->describeTree(out, indent + 2); + } + } + + private: + size_t findFieldIndexByFieldTag(int field_tag) + { + while (true) + { + if (field_tag == last_field_tag) + return last_field_index; + if (field_tag < last_field_tag) + break; + if (++last_field_index >= field_infos.size()) + break; + last_field_tag = field_infos[last_field_index].field_tag; + } + last_field_tag = field_tag; + auto it = field_index_by_field_tag.find(field_tag); + if (it == field_index_by_field_tag.end()) + last_field_index = static_cast<size_t>(-1); + else + last_field_index = it->second; + return last_field_index; + } + + void addDefaultsToMissingColumns(size_t row_num) + { + if (has_missing_columns) + missing_columns_filler->addDefaults(mutable_columns, row_num); + } + + bool isNullGoogleWrapper(size_t row_num) + { + return isGoogleWrapperField(parent_field_descriptor) && mutable_columns[0].get()->isNullAt(row_num); + } + + bool isNullableGoogleWrapper() + { + return isGoogleWrapperField(parent_field_descriptor) && mutable_columns[0].get()->isNullable(); + } + + struct FieldInfo + { + FieldInfo( + std::vector<size_t> && column_indices_, + const FieldDescriptor & field_descriptor_, + std::unique_ptr<ProtobufSerializer> field_serializer_) + : column_indices(std::move(column_indices_)) + , field_descriptor(&field_descriptor_) + , field_tag(field_descriptor_.number()) + , should_pack_repeated(shouldPackRepeated(field_descriptor_)) + , field_serializer(std::move(field_serializer_)) + { + } + std::vector<size_t> column_indices; + const FieldDescriptor * field_descriptor; + int field_tag; + bool should_pack_repeated; + std::unique_ptr<ProtobufSerializer> field_serializer; + bool field_read = false; + }; + + const FieldDescriptor * const parent_field_descriptor; + bool has_envelope_as_parent = false; + const bool with_length_delimiter; + const bool google_wrappers_special_treatment; + const 
std::unique_ptr<RowInputMissingColumnsFiller> missing_columns_filler; + const bool should_skip_if_empty; + ProtobufReader * const reader; + ProtobufWriter * const writer; + std::vector<FieldInfo> field_infos; + std::unordered_map<int, size_t> field_index_by_field_tag; + MutableColumns mutable_columns; + bool has_missing_columns = false; + int last_field_tag = 0; + size_t last_field_index = static_cast<size_t>(-1); + }; + + /// Serializes a top-level envelope message in the protobuf schema. + /// "Envelope" means that the contained subtree of serializers is enclosed in a message just once, + /// i.e. only when the first and the last row read/write trigger a read/write of the msg header. + class ProtobufSerializerEnvelope : public ProtobufSerializer + { + public: + ProtobufSerializerEnvelope( + std::unique_ptr<ProtobufSerializerMessage>&& serializer_, + const ProtobufReaderOrWriter & reader_or_writer_) + : serializer(std::move(serializer_)) + , reader(reader_or_writer_.reader) + , writer(reader_or_writer_.writer) + { + // The inner serializer has a backreference of type protobuf::FieldDescriptor * to it's parent + // serializer. If it is unset, it considers itself the top-level message, otherwise a nested + // message and accordingly it makes start/endMessage() vs. startEndNestedMessage() calls into + // Protobuf(Writer|Reader). There is no field descriptor because Envelopes merely forward calls + // but don't contain data to be serialized. We must still force the inner serializer to act + // as nested message. 
+ serializer->setHasEnvelopeAsParent(); + } + + void setColumns(const ColumnPtr * columns_, size_t num_columns_) override + { + serializer->setColumns(columns_, num_columns_); + } + + void setColumns(const MutableColumnPtr * columns_, size_t num_columns_) override + { + serializer->setColumns(columns_, num_columns_); + } + + void writeRow(size_t row_num) override + { + if (first_call_of_write_row) + { + writer->startMessage(); + first_call_of_write_row = false; + } + + serializer->writeRow(row_num); + } + + void finalizeWrite() override + { + writer->endMessage(/*with_length_delimiter = */ true); + } + + void reset() override + { + first_call_of_write_row = true; + } + + void readRow(size_t row_num) override + { + if (first_call_of_read_row) + { + reader->startMessage(/*with_length_delimiter = */ true); + first_call_of_read_row = false; + } + + int field_tag; + [[maybe_unused]] bool ret = reader->readFieldNumber(field_tag); + assert(ret); + + serializer->readRow(row_num); + } + + void insertDefaults(size_t row_num) override + { + serializer->insertDefaults(row_num); + } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializerEnvelope ->\n"; + serializer->describeTree(out, indent + 1); + } + + std::unique_ptr<ProtobufSerializerMessage> serializer; + ProtobufReader * const reader; + ProtobufWriter * const writer; + bool first_call_of_write_row = true; + bool first_call_of_read_row = true; + }; + + /// Serializes a tuple with explicit names as a nested message. 
+ class ProtobufSerializerTupleAsNestedMessage : public ProtobufSerializer + { + public: + explicit ProtobufSerializerTupleAsNestedMessage(std::unique_ptr<ProtobufSerializerMessage> message_serializer_) + : message_serializer(std::move(message_serializer_)) + { + } + + void setColumns(const ColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + if (num_columns != 1) + wrongNumberOfColumns(num_columns, "1"); + const auto & column_tuple = assert_cast<const ColumnTuple &>(*columns[0]); + size_t tuple_size = column_tuple.tupleSize(); + assert(tuple_size); + Columns element_columns; + element_columns.reserve(tuple_size); + for (size_t i : collections::range(tuple_size)) + element_columns.emplace_back(column_tuple.getColumnPtr(i)); + message_serializer->setColumns(element_columns.data(), element_columns.size()); + } + + void setColumns(const MutableColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + if (num_columns != 1) + wrongNumberOfColumns(num_columns, "1"); + ColumnPtr column0 = columns[0]->getPtr(); + setColumns(&column0, 1); + } + + void writeRow(size_t row_num) override { message_serializer->writeRow(row_num); } + void readRow(size_t row_num) override { message_serializer->readRow(row_num); } + void insertDefaults(size_t row_num) override { message_serializer->insertDefaults(row_num); } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializerTupleAsNestedMessage ->\n"; + message_serializer->describeTree(out, indent + 1); + } + + private: + const std::unique_ptr<ProtobufSerializerMessage> message_serializer; + }; + + + /// Serializes a flattened Nested data type (an array of tuples with explicit names) + /// as a repeated nested message. 
+ class ProtobufSerializerFlattenedNestedAsArrayOfNestedMessages : public ProtobufSerializer + { + public: + explicit ProtobufSerializerFlattenedNestedAsArrayOfNestedMessages( + const std::vector<std::string_view> & column_names_, + const FieldDescriptor * parent_field_descriptor_, + std::unique_ptr<ProtobufSerializerMessage> message_serializer_, + const std::function<String(size_t)> & get_root_desc_function_) + : parent_field_descriptor(parent_field_descriptor_) + , message_serializer(std::move(message_serializer_)) + , get_root_desc_function(get_root_desc_function_) + { + column_names.reserve(column_names_.size()); + for (const auto & column_name : column_names_) + column_names.emplace_back(column_name); + } + + void setColumns(const ColumnPtr * columns, size_t num_columns) override + { + if (!num_columns) + wrongNumberOfColumns(num_columns, ">0"); + data_columns.clear(); + data_columns.reserve(num_columns); + offset_columns.clear(); + offset_columns.reserve(num_columns); + + for (size_t i : collections::range(num_columns)) + { + const auto & column_array = assert_cast<const ColumnArray &>(*columns[i]); + data_columns.emplace_back(column_array.getDataPtr()); + + auto offset_column = column_array.getOffsetsPtr(); + if (std::binary_search(offset_columns.begin(), offset_columns.end(), offset_column)) + continue; + + /// Keep `offset_columns` sorted. + offset_columns.insert(std::upper_bound(offset_columns.begin(), offset_columns.end(), offset_column), offset_column); + + /// All the columns listed in `offset_columns` should have equal offsets. 
+ if (i >= 1) + { + const auto & column_array0 = assert_cast<const ColumnArray &>(*columns[0]); + if (!column_array0.hasEqualOffsets(column_array)) + { + throw Exception(ErrorCodes::PROTOBUF_BAD_CAST, + "Column #{} {} and column #{} {} are supposed " + "to have equal offsets according to the following serialization tree:\n{}", + 0, quoteString(column_names[0]), + i, quoteString(column_names[i]), get_root_desc_function(0)); + } + } + } + + message_serializer->setColumns(data_columns.data(), data_columns.size()); + } + + void setColumns(const MutableColumnPtr * columns, size_t num_columns) override + { + Columns cols; + cols.reserve(num_columns); + for (size_t i : collections::range(num_columns)) + cols.push_back(columns[i]->getPtr()); + setColumns(cols.data(), cols.size()); + } + + void writeRow(size_t row_num) override + { + const auto & offset_column0 = assert_cast<const ColumnArray::ColumnOffsets &>(*offset_columns[0]); + size_t start_offset = offset_column0.getElement(row_num - 1); + size_t end_offset = offset_column0.getElement(row_num); + for (size_t i : collections::range(start_offset, end_offset)) + message_serializer->writeRow(i); + } + + void readRow(size_t row_num) override + { + size_t old_size = offset_columns[0]->size(); + if (row_num + 1 < old_size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot replace an element in the middle of ColumnArray"); + + size_t old_data_size = data_columns[0]->size(); + + try + { + message_serializer->readRow(old_data_size); + size_t data_size = data_columns[0]->size(); + if (data_size != old_data_size + 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected number of elements of ColumnArray has been read"); + + if (row_num < old_size) + { + for (auto & offset_column : offset_columns) + assert_cast<ColumnArray::ColumnOffsets &>(offset_column->assumeMutableRef()).getData().back() = data_size; + } + else + { + for (auto & offset_column : offset_columns) + assert_cast<ColumnArray::ColumnOffsets 
&>(offset_column->assumeMutableRef()).getData().push_back(data_size); + } + } + catch (...) + { + for (auto & data_column : data_columns) + { + if (data_column->size() > old_data_size) + data_column->assumeMutableRef().popBack(data_column->size() - old_data_size); + } + for (auto & offset_column : offset_columns) + { + if (offset_column->size() > old_size) + offset_column->assumeMutableRef().popBack(offset_column->size() - old_size); + } + throw; + } + } + + void insertDefaults(size_t row_num) override + { + size_t old_size = offset_columns[0]->size(); + if (row_num < old_size) + return; + + try + { + size_t data_size = data_columns[0]->size(); + for (auto & offset_column : offset_columns) + assert_cast<ColumnArray::ColumnOffsets &>(offset_column->assumeMutableRef()).getData().push_back(data_size); + } + catch (...) + { + for (auto & offset_column : offset_columns) + { + if (offset_column->size() > old_size) + offset_column->assumeMutableRef().popBack(offset_column->size() - old_size); + } + throw; + } + } + + void describeTree(WriteBuffer & out, size_t indent) const override + { + writeIndent(out, indent) << "ProtobufSerializerFlattenedNestedAsArrayOfNestedMessages: columns "; + for (size_t i = 0; i != column_names.size(); ++i) + { + if (i) + out << ", "; + out << "#" << i << " " << quoteString(column_names[i]); + } + out << " ->"; + if (parent_field_descriptor) + out << " field " << quoteString(parent_field_descriptor->full_name()) << " (" << parent_field_descriptor->type_name() << ") ->\n"; + message_serializer->describeTree(out, indent + 1); + } + + private: + Strings column_names; + const FieldDescriptor * parent_field_descriptor; + const std::unique_ptr<ProtobufSerializerMessage> message_serializer; + const std::function<String(size_t)> get_root_desc_function; + Columns data_columns; + Columns offset_columns; + }; + + + /// Produces a tree of ProtobufSerializers which serializes a row as a protobuf message. 
+ class ProtobufSerializerBuilder + { + public: + explicit ProtobufSerializerBuilder(const ProtobufReaderOrWriter & reader_or_writer_) : reader_or_writer(reader_or_writer_) {} + + std::unique_ptr<ProtobufSerializer> buildMessageSerializer( + const Strings & column_names, + const DataTypes & data_types, + std::vector<size_t> & missing_column_indices, + const MessageDescriptor & message_descriptor, + bool with_length_delimiter, + bool with_envelope, + bool google_wrappers_special_treatment) + { + root_serializer_ptr = std::make_shared<ProtobufSerializer *>(); + get_root_desc_function = [my_root_serializer_ptr = root_serializer_ptr](size_t indent) -> String + { + WriteBufferFromOwnString buf; + (*my_root_serializer_ptr)->describeTree(buf, indent); + return buf.str(); + }; + + std::vector<size_t> used_column_indices; + auto message_serializer = buildMessageSerializerImpl( + /* num_columns = */ column_names.size(), + column_names.data(), + data_types.data(), + message_descriptor, + with_length_delimiter, + google_wrappers_special_treatment, + /* parent_field_descriptor = */ nullptr, + used_column_indices, + /* columns_are_reordered_outside = */ false, + /* check_nested_while_filling_missing_columns = */ true); + + if (!message_serializer) + { + throw Exception(ErrorCodes::NO_COLUMNS_SERIALIZED_TO_PROTOBUF_FIELDS, + "Not found matches between the names of the columns ({}) and the fields ({}) of the message {} in the protobuf schema", + boost::algorithm::join(column_names, ", "), boost::algorithm::join(getFieldNames(message_descriptor), ", "), + quoteString(message_descriptor.full_name())); + } + + missing_column_indices.clear(); + missing_column_indices.reserve(column_names.size() - used_column_indices.size()); + auto used_column_indices_sorted = std::move(used_column_indices); + ::sort(used_column_indices_sorted.begin(), used_column_indices_sorted.end()); + boost::range::set_difference(collections::range(column_names.size()), used_column_indices_sorted, + 
std::back_inserter(missing_column_indices)); + + if (!with_envelope) + { + *root_serializer_ptr = message_serializer.get(); +#if 0 + LOG_INFO(&Poco::Logger::get("ProtobufSerializer"), "Serialization tree:\n{}", get_root_desc_function(0)); +#endif + return message_serializer; + } + else + { + auto envelope_serializer = std::make_unique<ProtobufSerializerEnvelope>(std::move(message_serializer), reader_or_writer); + *root_serializer_ptr = envelope_serializer.get(); +#if 0 + LOG_INFO(&Poco::Logger::get("ProtobufSerializer"), "Serialization tree:\n{}", get_root_desc_function(0)); +#endif + return envelope_serializer; + } + } + + private: + /// Collects all field names from the message (used only to format error messages). + static Strings getFieldNames(const MessageDescriptor & message_descriptor) + { + Strings field_names; + field_names.reserve(message_descriptor.field_count()); + for (int i : collections::range(message_descriptor.field_count())) + field_names.emplace_back(message_descriptor.field(i)->name()); + return field_names; + } + + static bool columnNameEqualsToFieldName(std::string_view column_name, const FieldDescriptor & field_descriptor) + { + std::string_view suffix; + return columnNameStartsWithFieldName(column_name, field_descriptor, suffix) && suffix.empty(); + } + + /// Checks if a passed column's name starts with a specified field's name. + /// The function also assigns `suffix` to the rest part of the column's name + /// which doesn't match to the field's name. + /// The function requires that rest part of the column's name to be started with a dot '.' or underline '_', + /// but doesn't include those '.' or '_' characters into `suffix`. 
+ static bool columnNameStartsWithFieldName(std::string_view column_name, const FieldDescriptor & field_descriptor, std::string_view & suffix) + { + size_t matching_length = 0; + const MessageDescriptor & containing_type = *field_descriptor.containing_type(); + if (containing_type.options().map_entry()) + { + /// Special case. Elements of the data type Map are named as "keys" and "values", + /// but they're internally named as "key" and "value" in protobuf schema. + if (field_descriptor.number() == 1) + { + if (ColumnNameWithProtobufFieldNameComparator::startsWith(column_name, "keys")) + matching_length = strlen("keys"); + else if (ColumnNameWithProtobufFieldNameComparator::startsWith(column_name, "key")) + matching_length = strlen("key"); + } + else if (field_descriptor.number() == 2) + { + if (ColumnNameWithProtobufFieldNameComparator::startsWith(column_name, "values")) + matching_length = strlen("values"); + else if (ColumnNameWithProtobufFieldNameComparator::startsWith(column_name, "value")) + matching_length = strlen("value"); + } + } + if (!matching_length && ColumnNameWithProtobufFieldNameComparator::startsWith(column_name, field_descriptor.name())) + { + matching_length = field_descriptor.name().length(); + } + if (column_name.length() == matching_length) + return true; + if ((column_name.length() < matching_length + 2) || !field_descriptor.message_type()) + return false; + char first_char_after_matching = column_name[matching_length]; + if (!ColumnNameWithProtobufFieldNameComparator::equals(first_char_after_matching, '.')) + return false; + suffix = column_name.substr(matching_length + 1); + return true; + } + + /// Finds fields in the protobuf message which can be considered as matching + /// for a specified column's name. The found fields can be nested messages, + /// for that case suffixes are also returned. + /// This is only the first filter, buildMessageSerializerImpl() does other checks after calling this function. 
+ static bool findFieldsByColumnName( + std::string_view column_name, + const MessageDescriptor & message_descriptor, + std::vector<std::pair<const FieldDescriptor *, std::string_view /* suffix */>> & out_field_descriptors_with_suffixes, + bool google_wrappers_special_treatment) + { + out_field_descriptors_with_suffixes.clear(); + + /// Find all fields which have the same name as column's name (case-insensitively); i.e. we're checking + /// field_name == column_name. + for (int i : collections::range(message_descriptor.field_count())) + { + const auto & field_descriptor = *message_descriptor.field(i); + if (columnNameEqualsToFieldName(column_name, field_descriptor)) + { + std::string_view suffix = + google_wrappers_special_treatment && isGoogleWrapperField(field_descriptor) + ? googleWrapperColumnName(field_descriptor) + : ""; + out_field_descriptors_with_suffixes.emplace_back(&field_descriptor, suffix); + break; + } + } + + if (!out_field_descriptors_with_suffixes.empty()) + return true; /// We have an exact match, no need to compare prefixes. + + /// Find all fields which name is used as prefix in column's name; i.e. we're checking + /// column_name == field_name + '.' + nested_message_field_name + for (int i : collections::range(message_descriptor.field_count())) + { + const auto & field_descriptor = *message_descriptor.field(i); + std::string_view suffix; + if (columnNameStartsWithFieldName(column_name, field_descriptor, suffix)) + { + out_field_descriptors_with_suffixes.emplace_back(&field_descriptor, suffix); + } + } + + /// Shorter suffixes first. 
+ ::sort(out_field_descriptors_with_suffixes.begin(), out_field_descriptors_with_suffixes.end(), + [](const std::pair<const FieldDescriptor *, std::string_view /* suffix */> & f1, + const std::pair<const FieldDescriptor *, std::string_view /* suffix */> & f2) + { + return f1.second.length() < f2.second.length(); + }); + + return !out_field_descriptors_with_suffixes.empty(); + } + + /// Removes TypeIndex::Array from the specified vector of data types, + /// and also removes corresponding elements from two other vectors. + template <typename T1, typename T2> + static void removeNonArrayElements(DataTypes & data_types, std::vector<T1> & elements1, std::vector<T2> & elements2) + { + size_t initial_size = data_types.size(); + assert(initial_size == elements1.size() && initial_size == elements2.size()); + data_types.reserve(initial_size * 2); + elements1.reserve(initial_size * 2); + elements2.reserve(initial_size * 2); + for (size_t i : collections::range(initial_size)) + { + if (data_types[i]->getTypeId() == TypeIndex::Array) + { + data_types.push_back(std::move(data_types[i])); + elements1.push_back(std::move(elements1[i])); + elements2.push_back(std::move(elements2[i])); + } + } + data_types.erase(data_types.begin(), data_types.begin() + initial_size); + elements1.erase(elements1.begin(), elements1.begin() + initial_size); + elements2.erase(elements2.begin(), elements2.begin() + initial_size); + } + + /// Treats specified column indices as indices in another vector of column indices. + /// Useful for handling of nested messages. + static void transformColumnIndices(std::vector<size_t> & column_indices, const std::vector<size_t> & outer_indices) + { + for (size_t & idx : column_indices) + idx = outer_indices[idx]; + } + + /// Builds a serializer for a protobuf message (root or nested). + /// + /// Some of the passed columns might be skipped, the function sets `used_column_indices` to + /// the list of those columns which match any fields in the protobuf message. 
///
/// Normally `columns_are_reordered_outside` should be false - if it's false it means that
/// the used column indices will be passed to ProtobufSerializerMessage, which will write/read
/// only those columns and set the rest of columns by default.
/// Set `columns_are_reordered_outside` to true if you're going to reorder columns
/// according to `used_column_indices` returned and pass to
/// ProtobufSerializerMessage::setColumns() only the columns which are actually used.
///
/// Returns nullptr if none of the columns matches any field of the message.
///
/// This overload only converts the column names to std::string_view and forwards to the overload below.
std::unique_ptr<ProtobufSerializerMessage> buildMessageSerializerImpl(
    size_t num_columns,
    const String * column_names,
    const DataTypePtr * data_types,
    const MessageDescriptor & message_descriptor,
    bool with_length_delimiter,
    bool google_wrappers_special_treatment,
    const FieldDescriptor * parent_field_descriptor,
    std::vector<size_t> & used_column_indices,
    bool columns_are_reordered_outside,
    bool check_nested_while_filling_missing_columns)
{
    /// The views refer to the caller's strings, no copies of the names are made.
    std::vector<std::string_view> column_names_sv;
    column_names_sv.reserve(num_columns);
    for (size_t i = 0; i != num_columns; ++i)
        column_names_sv.emplace_back(column_names[i]);

    return buildMessageSerializerImpl(
        num_columns,
        column_names_sv.data(),
        data_types,
        message_descriptor,
        with_length_delimiter,
        google_wrappers_special_treatment,
        parent_field_descriptor,
        used_column_indices,
        columns_are_reordered_outside,
        check_nested_while_filling_missing_columns);
}

/// The actual implementation, see the description above.
///
/// `column_names` and `data_types` point to arrays of `num_columns` elements.
/// `used_column_indices` is cleared first and then filled in the order the columns get attached to fields.
/// Throws MULTIPLE_COLUMNS_SERIALIZED_TO_SAME_PROTOBUF_FIELD if two columns end up at the same field, and
/// NO_COLUMN_SERIALIZED_TO_REQUIRED_PROTOBUF_FIELD if a proto2 required field has no column while writing.
std::unique_ptr<ProtobufSerializerMessage> buildMessageSerializerImpl(
    size_t num_columns,
    const std::string_view * column_names,
    const DataTypePtr * data_types,
    const MessageDescriptor & message_descriptor,
    bool with_length_delimiter,
    bool google_wrappers_special_treatment,
    const FieldDescriptor * parent_field_descriptor,
    std::vector<size_t> & used_column_indices,
    bool columns_are_reordered_outside,
    bool check_nested_while_filling_missing_columns)
{
    std::vector<ProtobufSerializerMessage::FieldDesc> field_descs;
    /// Maps each field which already has a serializer to the name of the column attached to it
    /// (the name is used in the error message below).
    boost::container::flat_map<const FieldDescriptor *, std::string_view> field_descriptors_in_use;

    used_column_indices.clear();
    used_column_indices.reserve(num_columns);
    /// The same indices as in `used_column_indices`, kept in a set for lookups.
    boost::container::flat_set<size_t> used_column_indices_sorted;
    used_column_indices_sorted.reserve(num_columns);
    /// Next index to hand out when `columns_are_reordered_outside` is true.
    size_t sequential_column_index = 0;

    /// Registers `field_serializer_` for `field_descriptor_` and marks `column_indices_` as used.
    auto add_field_serializer = [&](std::string_view column_name_,
                                    std::vector<size_t> && column_indices_,
                                    const FieldDescriptor & field_descriptor_,
                                    std::unique_ptr<ProtobufSerializer> field_serializer_)
    {
        auto it = field_descriptors_in_use.find(&field_descriptor_);
        if (it != field_descriptors_in_use.end())
        {
            throw Exception(ErrorCodes::MULTIPLE_COLUMNS_SERIALIZED_TO_SAME_PROTOBUF_FIELD,
                            "Multiple columns ({}, {}) cannot be serialized to a single protobuf field {}",
                            backQuote(StringRef{it->second}), backQuote(StringRef{column_name_}), quoteString(field_descriptor_.full_name()));
        }

        used_column_indices.insert(used_column_indices.end(), column_indices_.begin(), column_indices_.end());
        used_column_indices_sorted.insert(column_indices_.begin(), column_indices_.end());

        /// If the caller reorders the columns, the message serializer gets sequential indices 0, 1, 2, ...
        /// in the order the columns were added; otherwise it gets the original indices.
        auto column_indices_to_pass_to_message_serializer = std::move(column_indices_);
        if (columns_are_reordered_outside)
        {
            for (auto & index : column_indices_to_pass_to_message_serializer)
                index = sequential_column_index++;
        }

        field_descs.push_back({std::move(column_indices_to_pass_to_message_serializer), &field_descriptor_, std::move(field_serializer_)});
        field_descriptors_in_use.emplace(&field_descriptor_, column_name_);
    };

    /// Declared outside the loop so the storage is reused; findFieldsByColumnName() clears it on every call.
    std::vector<std::pair<const FieldDescriptor *, std::string_view>> field_descriptors_with_suffixes;

    /// We're going through all the passed columns.
    for (size_t column_idx : collections::range(num_columns))
    {
        /// The column could have been already consumed as a part of a nested message started by an earlier column.
        if (used_column_indices_sorted.count(column_idx))
            continue;

        const auto & column_name = column_names[column_idx];
        const auto & data_type = data_types[column_idx];

        if (!findFieldsByColumnName(column_name, message_descriptor, field_descriptors_with_suffixes, google_wrappers_special_treatment))
            continue;

        if ((field_descriptors_with_suffixes.size() == 1) && field_descriptors_with_suffixes[0].second.empty())
        {
            /// Simple case: one column is serialized as one field.
            const auto & field_descriptor = *field_descriptors_with_suffixes[0].first;
            auto field_serializer = buildFieldSerializer(column_name, data_type,
                field_descriptor, field_descriptor.is_repeated(), google_wrappers_special_treatment);

            if (field_serializer)
            {
                add_field_serializer(column_name, {column_idx}, field_descriptor, std::move(field_serializer));
                continue;
            }
        }

        /// Candidates are tried in the order returned by findFieldsByColumnName();
        /// the first candidate which yields a serializer wins (see the `break` statements below).
        for (const auto & [field_descriptor, suffix] : field_descriptors_with_suffixes)
        {
            if (!suffix.empty())
            {
                /// Complex case: one or more columns are serialized as a nested message.
                std::vector<size_t> nested_column_indices;
                std::vector<std::string_view> nested_column_names;
                nested_column_indices.reserve(num_columns - used_column_indices.size());
                nested_column_names.reserve(num_columns - used_column_indices.size());
                nested_column_indices.push_back(column_idx);
                nested_column_names.push_back(suffix);

                /// Collect the following unused columns whose names start with the same field's name.
                for (size_t j : collections::range(column_idx + 1, num_columns))
                {
                    if (used_column_indices_sorted.count(j))
                        continue;
                    std::string_view other_suffix;
                    if (!columnNameStartsWithFieldName(column_names[j], *field_descriptor, other_suffix))
                        continue;
                    nested_column_indices.push_back(j);
                    nested_column_names.push_back(other_suffix);
                }

                DataTypes nested_data_types;
                nested_data_types.reserve(nested_column_indices.size());
                for (size_t j : nested_column_indices)
                    nested_data_types.push_back(data_types[j]);

                /// Now we have up to `nested_column_names.size()` columns
                /// which can be serialized as a nested message.

                /// We will try to serialize those columns as one nested message,
                /// then, if failed, as an array of nested messages (on condition if those columns are array).
                bool has_fallback_to_array_of_nested_messages = false;
                if (field_descriptor->is_repeated())
                {
                    bool has_arrays
                        = boost::range::find_if(
                              nested_data_types, [](const DataTypePtr & dt) { return (dt->getTypeId() == TypeIndex::Array); })
                        != nested_data_types.end();
                    if (has_arrays)
                        has_fallback_to_array_of_nested_messages = true;
                }

                /// Try to serialize those columns as one nested message.
                try
                {
                    std::vector<size_t> used_column_indices_in_nested;
                    auto nested_message_serializer = buildMessageSerializerImpl(
                        nested_column_names.size(),
                        nested_column_names.data(),
                        nested_data_types.data(),
                        *field_descriptor->message_type(),
                        /* with_length_delimiter = */ false,
                        google_wrappers_special_treatment,
                        field_descriptor,
                        used_column_indices_in_nested,
                        /* columns_are_reordered_outside = */ true,
                        /* check_nested_while_filling_missing_columns = */ false);

                    /// `columns_are_reordered_outside` is true because column indices are
                    /// going to be transformed and then written to the outer message,
                    /// see add_field_serializer() below.

                    if (nested_message_serializer)
                    {
                        transformColumnIndices(used_column_indices_in_nested, nested_column_indices);
                        add_field_serializer(
                            column_name,
                            std::move(used_column_indices_in_nested),
                            *field_descriptor,
                            std::move(nested_message_serializer));
                        break;
                    }
                }
                catch (Exception & e)
                {
                    /// Only the "field is not repeated" error is tolerated, and only if the fallback below is possible;
                    /// any other error is rethrown as is.
                    if ((e.code() != ErrorCodes::PROTOBUF_FIELD_NOT_REPEATED) || !has_fallback_to_array_of_nested_messages)
                        throw;
                }

                if (has_fallback_to_array_of_nested_messages)
                {
                    /// Try to serialize those columns as an array of nested messages.
                    /// Only Array columns take part in it, and each of them is replaced with its element type.
                    removeNonArrayElements(nested_data_types, nested_column_names, nested_column_indices);
                    for (DataTypePtr & dt : nested_data_types)
                        dt = assert_cast<const DataTypeArray &>(*dt).getNestedType();

                    std::vector<size_t> used_column_indices_in_nested;
                    auto nested_message_serializer = buildMessageSerializerImpl(
                        nested_column_names.size(),
                        nested_column_names.data(),
                        nested_data_types.data(),
                        *field_descriptor->message_type(),
                        /* with_length_delimiter = */ false,
                        google_wrappers_special_treatment,
                        field_descriptor,
                        used_column_indices_in_nested,
                        /* columns_are_reordered_outside = */ true,
                        /* check_nested_while_filling_missing_columns = */ false);

                    /// `columns_are_reordered_outside` is true because column indices are
                    /// going to be transformed and then written to the outer message,
                    /// see add_field_serializer() below.

                    if (nested_message_serializer)
                    {
                        /// The names must be collected before transformColumnIndices() is called,
                        /// because at this point the indices still refer to `nested_column_names`.
                        std::vector<std::string_view> column_names_used;
                        column_names_used.reserve(used_column_indices_in_nested.size());
                        for (size_t i : used_column_indices_in_nested)
                            column_names_used.emplace_back(nested_column_names[i]);
                        auto field_serializer = std::make_unique<ProtobufSerializerFlattenedNestedAsArrayOfNestedMessages>(
                            std::move(column_names_used), field_descriptor, std::move(nested_message_serializer), get_root_desc_function);
                        transformColumnIndices(used_column_indices_in_nested, nested_column_indices);
                        add_field_serializer(column_name, std::move(used_column_indices_in_nested), *field_descriptor, std::move(field_serializer));
                        break;
                    }
                }
            }
        }
    }

    /// Check that we've found matching columns for all the required fields.
    /// The check is done only for the proto2 syntax and only when writing.
    if ((message_descriptor.file()->syntax() == google::protobuf::FileDescriptor::SYNTAX_PROTO2)
        && reader_or_writer.writer)
    {
        for (int i : collections::range(message_descriptor.field_count()))
        {
            const auto & field_descriptor = *message_descriptor.field(i);
            if (field_descriptor.is_required() && !field_descriptors_in_use.count(&field_descriptor))
                throw Exception(ErrorCodes::NO_COLUMN_SERIALIZED_TO_REQUIRED_PROTOBUF_FIELD, "Field {} is required to be set",
                    quoteString(field_descriptor.full_name()));
        }
    }

    /// No column matched any field, the caller decides whether it's an error.
    if (field_descs.empty())
        return nullptr;

    /// The filler is created only when reading; it stays null when writing.
    std::unique_ptr<RowInputMissingColumnsFiller> missing_columns_filler;
    if (reader_or_writer.reader)
    {
        if (check_nested_while_filling_missing_columns)
            missing_columns_filler = std::make_unique<RowInputMissingColumnsFiller>(num_columns, column_names, data_types);
        else
            missing_columns_filler = std::make_unique<RowInputMissingColumnsFiller>();
    }

    return std::make_unique<ProtobufSerializerMessage>(
        std::move(field_descs), parent_field_descriptor, with_length_delimiter, google_wrappers_special_treatment,
        std::move(missing_columns_filler), reader_or_writer);
}

/// Builds a serializer for one-to-one match:
/// one column is serialized as one field in the protobuf message.
+ std::unique_ptr<ProtobufSerializer> buildFieldSerializer( + std::string_view column_name, + const DataTypePtr & data_type, + const FieldDescriptor & field_descriptor, + bool allow_repeat, + bool google_wrappers_special_treatment) + { + auto data_type_id = data_type->getTypeId(); + switch (data_type_id) + { + case TypeIndex::UInt8: return std::make_unique<ProtobufSerializerNumber<UInt8>>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::UInt16: return std::make_unique<ProtobufSerializerNumber<UInt16>>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::UInt32: return std::make_unique<ProtobufSerializerNumber<UInt32>>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::UInt64: return std::make_unique<ProtobufSerializerNumber<UInt64>>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::UInt128: return std::make_unique<ProtobufSerializerNumber<UInt128>>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::UInt256: return std::make_unique<ProtobufSerializerNumber<UInt256>>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::Int8: return std::make_unique<ProtobufSerializerNumber<Int8>>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::Int16: return std::make_unique<ProtobufSerializerNumber<Int16>>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::Date32: return std::make_unique<ProtobufSerializerDate32>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::Int32: return std::make_unique<ProtobufSerializerNumber<Int32>>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::Int64: return std::make_unique<ProtobufSerializerNumber<Int64>>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::Int128: return std::make_unique<ProtobufSerializerNumber<Int128>>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::Int256: return std::make_unique<ProtobufSerializerNumber<Int256>>(column_name, 
field_descriptor, reader_or_writer); + case TypeIndex::Float32: return std::make_unique<ProtobufSerializerNumber<Float32>>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::Float64: return std::make_unique<ProtobufSerializerNumber<Float64>>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::Date: return std::make_unique<ProtobufSerializerDate>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::DateTime: return std::make_unique<ProtobufSerializerDateTime>(column_name, assert_cast<const DataTypeDateTime &>(*data_type), field_descriptor, reader_or_writer); + case TypeIndex::DateTime64: return std::make_unique<ProtobufSerializerDateTime64>(column_name, assert_cast<const DataTypeDateTime64 &>(*data_type), field_descriptor, reader_or_writer); + case TypeIndex::String: return std::make_unique<ProtobufSerializerString<false>>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::FixedString: return std::make_unique<ProtobufSerializerString<true>>(column_name, typeid_cast<std::shared_ptr<const DataTypeFixedString>>(data_type), field_descriptor, reader_or_writer); + case TypeIndex::Enum8: return std::make_unique<ProtobufSerializerEnum<Int8>>(column_name, typeid_cast<std::shared_ptr<const DataTypeEnum8>>(data_type), field_descriptor, reader_or_writer); + case TypeIndex::Enum16: return std::make_unique<ProtobufSerializerEnum<Int16>>(column_name, typeid_cast<std::shared_ptr<const DataTypeEnum16>>(data_type), field_descriptor, reader_or_writer); + case TypeIndex::Decimal32: return std::make_unique<ProtobufSerializerDecimal<Decimal32>>(column_name, assert_cast<const DataTypeDecimal<Decimal32> &>(*data_type), field_descriptor, reader_or_writer); + case TypeIndex::Decimal64: return std::make_unique<ProtobufSerializerDecimal<Decimal64>>(column_name, assert_cast<const DataTypeDecimal<Decimal64> &>(*data_type), field_descriptor, reader_or_writer); + case TypeIndex::Decimal128: return 
std::make_unique<ProtobufSerializerDecimal<Decimal128>>(column_name, assert_cast<const DataTypeDecimal<Decimal128> &>(*data_type), field_descriptor, reader_or_writer); + case TypeIndex::Decimal256: return std::make_unique<ProtobufSerializerDecimal<Decimal256>>(column_name, assert_cast<const DataTypeDecimal<Decimal256> &>(*data_type), field_descriptor, reader_or_writer); + case TypeIndex::UUID: return std::make_unique<ProtobufSerializerUUID>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::IPv4: return std::make_unique<ProtobufSerializerIPv4>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::IPv6: return std::make_unique<ProtobufSerializerIPv6>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::Interval: return std::make_unique<ProtobufSerializerInterval>(column_name, field_descriptor, reader_or_writer); + case TypeIndex::AggregateFunction: return std::make_unique<ProtobufSerializerAggregateFunction>(column_name, typeid_cast<std::shared_ptr<const DataTypeAggregateFunction>>(data_type), field_descriptor, reader_or_writer); + + case TypeIndex::Nullable: + { + const auto & nullable_data_type = assert_cast<const DataTypeNullable &>(*data_type); + auto nested_serializer = buildFieldSerializer(column_name, nullable_data_type.getNestedType(), + field_descriptor, allow_repeat, google_wrappers_special_treatment); + if (!nested_serializer) + return nullptr; + return std::make_unique<ProtobufSerializerNullable>(std::move(nested_serializer)); + } + + case TypeIndex::LowCardinality: + { + const auto & low_cardinality_data_type = assert_cast<const DataTypeLowCardinality &>(*data_type); + auto nested_serializer + = buildFieldSerializer(column_name, low_cardinality_data_type.getDictionaryType(), + field_descriptor, allow_repeat, google_wrappers_special_treatment); + if (!nested_serializer) + return nullptr; + return std::make_unique<ProtobufSerializerLowCardinality>(std::move(nested_serializer)); + } + + case TypeIndex::Map: + { 
+ const auto & map_data_type = assert_cast<const DataTypeMap &>(*data_type); + auto nested_serializer = buildFieldSerializer(column_name, map_data_type.getNestedType(), + field_descriptor, allow_repeat, google_wrappers_special_treatment); + if (!nested_serializer) + return nullptr; + return std::make_unique<ProtobufSerializerMap>(std::move(nested_serializer)); + } + + case TypeIndex::Array: + { + /// Array is serialized as a repeated field. + const auto & array_data_type = assert_cast<const DataTypeArray &>(*data_type); + + if (!allow_repeat) + { + /// Case of nested Arrays. Nested Array can be a message with one repeated field. + /// For example we have an column `arr Array(Array(UInt32))` and the next proto schema: + /// message Message { + /// message NestedArray { + /// repeated uint32 nested = 2; + /// } + /// repeated NestedArray arr = 1; + /// } + if (field_descriptor.message_type() && field_descriptor.message_type()->field_count() == 1) + { + Names column_names = {field_descriptor.message_type()->field(0)->name()}; + DataTypes data_types = {data_type}; + /// Try to serialize as a nested message. + std::vector<size_t> used_column_indices; + auto message_serializer = buildMessageSerializerImpl( + 1, + column_names.data(), + data_types.data(), + *field_descriptor.message_type(), + /* with_length_delimiter = */ false, + google_wrappers_special_treatment, + &field_descriptor, + used_column_indices, + /* columns_are_reordered_outside = */ false, + /* check_nested_while_filling_missing_columns = */ false); + + if (!message_serializer) + return nullptr; + + return message_serializer; + } + + throwFieldNotRepeated(field_descriptor, column_name); + } + + auto nested_serializer = buildFieldSerializer(column_name, array_data_type.getNestedType(), field_descriptor, + /* allow_repeat = */ false, // We do our repeating now, so for nested type we forget about the repeating. 
+ google_wrappers_special_treatment); + if (!nested_serializer) + return nullptr; + return std::make_unique<ProtobufSerializerArray>(std::move(nested_serializer)); + } + + case TypeIndex::Tuple: + { + /// Tuple is serialized in one of two ways: + /// 1) If the tuple has explicit names then it can be serialized as a nested message. + /// 2) Any tuple can be serialized as a repeated field, just like Array. + const auto & tuple_data_type = assert_cast<const DataTypeTuple &>(*data_type); + size_t size_of_tuple = tuple_data_type.getElements().size(); + + if (const auto * message_type = field_descriptor.message_type()) + { + bool have_explicit_names = tuple_data_type.haveExplicitNames(); + Names element_names; + if (have_explicit_names) + { + element_names = tuple_data_type.getElementNames(); + } + else + { + /// Match unnamed Tuple elements and Message fields by position. + size_t field_count = message_type->field_count(); + if (field_count != size_of_tuple) + throw Exception( + ErrorCodes::NO_COLUMNS_SERIALIZED_TO_PROTOBUF_FIELDS, + "The number of fields in Protobuf message ({}) is not equal to the number of elements in unnamed Tuple ({})", + field_count, + size_of_tuple); + for (size_t i = 0; i != field_count; ++i) + element_names.push_back(message_type->field(static_cast<int>(i))->name()); + } + + /// Try to serialize as a nested message. 
+ std::vector<size_t> used_column_indices; + auto message_serializer = buildMessageSerializerImpl( + size_of_tuple, + element_names.data(), + tuple_data_type.getElements().data(), + *message_type, + /* with_length_delimiter = */ false, + google_wrappers_special_treatment, + &field_descriptor, + used_column_indices, + /* columns_are_reordered_outside = */ false, + /* check_nested_while_filling_missing_columns = */ false); + + if (!message_serializer) + { + throw Exception(ErrorCodes::NO_COLUMNS_SERIALIZED_TO_PROTOBUF_FIELDS, + "Not found matches between the names of the tuple's elements ({}) and the fields ({}) " + "of the message {} in the protobuf schema", + boost::algorithm::join(tuple_data_type.getElementNames(), ", "), + boost::algorithm::join(getFieldNames(*field_descriptor.message_type()), ", "), + quoteString(field_descriptor.message_type()->full_name())); + } + + return std::make_unique<ProtobufSerializerTupleAsNestedMessage>(std::move(message_serializer)); + } + + /// Serialize as a repeated field. + if (!allow_repeat && (size_of_tuple > 1)) + throwFieldNotRepeated(field_descriptor, column_name); + + std::vector<std::unique_ptr<ProtobufSerializer>> nested_serializers; + for (const auto & nested_data_type : tuple_data_type.getElements()) + { + auto nested_serializer = buildFieldSerializer(column_name, nested_data_type, field_descriptor, + /* allow_repeat = */ false, // We do our repeating now, so for nested type we forget about the repeating. 
+ google_wrappers_special_treatment); + if (!nested_serializer) + break; + nested_serializers.push_back(std::move(nested_serializer)); + } + + if (nested_serializers.size() != size_of_tuple) + return nullptr; + + return std::make_unique<ProtobufSerializerTupleAsArray>( + column_name, + typeid_cast<std::shared_ptr<const DataTypeTuple>>(data_type), + field_descriptor, + std::move(nested_serializers)); + } + + default: + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Type {} is not supported in Protobuf format", data_type->getName()); + } + } + + [[noreturn]] static void throwFieldNotRepeated(const FieldDescriptor & field_descriptor, std::string_view column_name) + { + if (!field_descriptor.is_repeated()) + throw Exception(ErrorCodes::PROTOBUF_FIELD_NOT_REPEATED, + "The field {} must be repeated in the protobuf schema to match the column {}", + quoteString(field_descriptor.full_name()), backQuote(StringRef{column_name})); + + throw Exception(ErrorCodes::PROTOBUF_FIELD_NOT_REPEATED, + "The field {} is repeated but the level of repeatedness is not enough " + "to serialize a multidimensional array from the column {}. 
" + "It's recommended to make the parent field repeated as well.", + quoteString(field_descriptor.full_name()), backQuote(StringRef{column_name})); + } + + const ProtobufReaderOrWriter reader_or_writer; + std::function<String(size_t)> get_root_desc_function; + std::shared_ptr<ProtobufSerializer *> root_serializer_ptr; + }; + + template <typename Type> + DataTypePtr getEnumDataType(const google::protobuf::EnumDescriptor * enum_descriptor) + { + std::vector<std::pair<String, Type>> values; + for (int i = 0; i != enum_descriptor->value_count(); ++i) + { + const auto * enum_value_descriptor = enum_descriptor->value(i); + values.emplace_back(enum_value_descriptor->name(), enum_value_descriptor->number()); + } + return std::make_shared<DataTypeEnum<Type>>(std::move(values)); + } + + std::optional<NameAndTypePair> getNameAndDataTypeFromField(const google::protobuf::FieldDescriptor * field_descriptor, bool skip_unsupported_fields, bool allow_repeat = true) + { + if (allow_repeat && field_descriptor->is_map()) + { + auto name_and_type = getNameAndDataTypeFromField(field_descriptor, skip_unsupported_fields, false); + if (!name_and_type) + return std::nullopt; + const auto * tuple_type = assert_cast<const DataTypeTuple *>(name_and_type->type.get()); + return NameAndTypePair{name_and_type->name, std::make_shared<DataTypeMap>(tuple_type->getElements())}; + } + + if (allow_repeat && field_descriptor->is_repeated()) + { + auto name_and_type = getNameAndDataTypeFromField(field_descriptor, skip_unsupported_fields, false); + if (!name_and_type) + return std::nullopt; + return NameAndTypePair{name_and_type->name, std::make_shared<DataTypeArray>(name_and_type->type)}; + } + + switch (field_descriptor->type()) + { + case FieldTypeId::TYPE_SFIXED32: + case FieldTypeId::TYPE_SINT32: + case FieldTypeId::TYPE_INT32: + return NameAndTypePair{field_descriptor->name(), std::make_shared<DataTypeInt32>()}; + case FieldTypeId::TYPE_SFIXED64: + case FieldTypeId::TYPE_SINT64: + case 
FieldTypeId::TYPE_INT64: + return NameAndTypePair{field_descriptor->name(), std::make_shared<DataTypeInt64>()}; + case FieldTypeId::TYPE_BOOL: + return NameAndTypePair{field_descriptor->name(), std::make_shared<DataTypeUInt8>()}; + case FieldTypeId::TYPE_FLOAT: + return NameAndTypePair{field_descriptor->name(), std::make_shared<DataTypeFloat32>()}; + case FieldTypeId::TYPE_DOUBLE: + return NameAndTypePair{field_descriptor->name(), std::make_shared<DataTypeFloat64>()}; + case FieldTypeId::TYPE_UINT32: + case FieldTypeId::TYPE_FIXED32: + return NameAndTypePair{field_descriptor->name(), std::make_shared<DataTypeUInt32>()}; + case FieldTypeId::TYPE_UINT64: + case FieldTypeId::TYPE_FIXED64: + return NameAndTypePair{field_descriptor->name(), std::make_shared<DataTypeUInt64>()}; + case FieldTypeId::TYPE_BYTES: + case FieldTypeId::TYPE_STRING: + return NameAndTypePair{field_descriptor->name(), std::make_shared<DataTypeString>()}; + case FieldTypeId::TYPE_ENUM: + { + const auto * enum_descriptor = field_descriptor->enum_type(); + if (enum_descriptor->value_count() == 0) + { + if (skip_unsupported_fields) + return std::nullopt; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Empty enum field"); + } + int max_abs = std::abs(enum_descriptor->value(0)->number()); + for (int i = 1; i != enum_descriptor->value_count(); ++i) + { + if (std::abs(enum_descriptor->value(i)->number()) > max_abs) + max_abs = std::abs(enum_descriptor->value(i)->number()); + } + if (max_abs < 128) + return NameAndTypePair{field_descriptor->name(), getEnumDataType<Int8>(enum_descriptor)}; + else if (max_abs < 32768) + return NameAndTypePair{field_descriptor->name(), getEnumDataType<Int16>(enum_descriptor)}; + else + { + if (skip_unsupported_fields) + return std::nullopt; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "ClickHouse supports only 8-bit and 16-bit enums"); + } + } + case FieldTypeId::TYPE_GROUP: + case FieldTypeId::TYPE_MESSAGE: + { + const auto * message_descriptor = 
field_descriptor->message_type(); + if (message_descriptor->field_count() == 0) + { + if (skip_unsupported_fields) + return std::nullopt; + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Empty messages are not supported"); + } + else if (message_descriptor->field_count() == 1) + { + const auto * nested_field_descriptor = message_descriptor->field(0); + auto nested_name_and_type = getNameAndDataTypeFromField(nested_field_descriptor, skip_unsupported_fields); + if (!nested_name_and_type) + return std::nullopt; + return NameAndTypePair{field_descriptor->name() + "_" + nested_name_and_type->name, nested_name_and_type->type}; + } + else + { + DataTypes nested_types; + Strings nested_names; + for (int i = 0; i != message_descriptor->field_count(); ++i) + { + auto nested_name_and_type = getNameAndDataTypeFromField(message_descriptor->field(i), skip_unsupported_fields); + if (!nested_name_and_type) + continue; + nested_types.push_back(nested_name_and_type->type); + nested_names.push_back(nested_name_and_type->name); + } + + if (nested_types.empty()) + return std::nullopt; + return NameAndTypePair{field_descriptor->name(), std::make_shared<DataTypeTuple>(std::move(nested_types), std::move(nested_names))}; + } + } + } + + UNREACHABLE(); + } +} + +std::unique_ptr<ProtobufSerializer> ProtobufSerializer::create( + const Strings & column_names, + const DataTypes & data_types, + std::vector<size_t> & missing_column_indices, + const google::protobuf::Descriptor & message_descriptor, + bool with_length_delimiter, + bool with_envelope, + bool flatten_google_wrappers, + ProtobufReader & reader) +{ + return ProtobufSerializerBuilder(reader).buildMessageSerializer(column_names, data_types, missing_column_indices, message_descriptor, with_length_delimiter, with_envelope, flatten_google_wrappers); +} + +std::unique_ptr<ProtobufSerializer> ProtobufSerializer::create( + const Strings & column_names, + const DataTypes & data_types, + const google::protobuf::Descriptor & message_descriptor, + 
bool with_length_delimiter, + bool with_envelope, + bool defaults_for_nullable_google_wrappers, + ProtobufWriter & writer) +{ + std::vector<size_t> missing_column_indices; + return ProtobufSerializerBuilder(writer).buildMessageSerializer(column_names, data_types, missing_column_indices, message_descriptor, with_length_delimiter, with_envelope, defaults_for_nullable_google_wrappers); +} + +NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor, bool skip_unsupported_fields) +{ + NamesAndTypesList schema; + for (int i = 0; i != message_descriptor->field_count(); ++i) + { + if (auto name_and_type = getNameAndDataTypeFromField(message_descriptor->field(i), skip_unsupported_fields)) + schema.push_back(*name_and_type); + } + if (schema.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot convert Protobuf schema to ClickHouse table schema, all fields have unsupported types"); + return schema; +} + +} +#endif diff --git a/contrib/clickhouse/src/Formats/ProtobufSerializer.h b/contrib/clickhouse/src/Formats/ProtobufSerializer.h new file mode 100644 index 0000000000..008e08416d --- /dev/null +++ b/contrib/clickhouse/src/Formats/ProtobufSerializer.h @@ -0,0 +1,61 @@ +#pragma once + +#include "clickhouse_config.h" + +#if USE_PROTOBUF +# include <Columns/IColumn.h> +#include <Core/NamesAndTypes.h> + + +namespace google::protobuf { class Descriptor; } + +namespace DB +{ +class ProtobufReader; +class ProtobufWriter; +class IDataType; +using DataTypePtr = std::shared_ptr<const IDataType>; +using DataTypes = std::vector<DataTypePtr>; +class WriteBuffer; + +/// Utility class, does all the work for serialization in the Protobuf format. 
+class ProtobufSerializer +{ +public: + virtual ~ProtobufSerializer() = default; + + virtual void setColumns(const ColumnPtr * columns, size_t num_columns) = 0; + virtual void writeRow(size_t row_num) = 0; + virtual void finalizeWrite() {} + virtual void reset() {} + + virtual void setColumns(const MutableColumnPtr * columns, size_t num_columns) = 0; + virtual void readRow(size_t row_num) = 0; + virtual void insertDefaults(size_t row_num) = 0; + + virtual void describeTree(WriteBuffer & out, size_t indent) const = 0; + + static std::unique_ptr<ProtobufSerializer> create( + const Strings & column_names, + const DataTypes & data_types, + std::vector<size_t> & missing_column_indices, + const google::protobuf::Descriptor & message_descriptor, + bool with_length_delimiter, + bool with_envelope, + bool flatten_google_wrappers, + ProtobufReader & reader); + + static std::unique_ptr<ProtobufSerializer> create( + const Strings & column_names, + const DataTypes & data_types, + const google::protobuf::Descriptor & message_descriptor, + bool with_length_delimiter, + bool with_envelope, + bool defaults_for_nullable_google_wrappers, + ProtobufWriter & writer); +}; + +NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor, bool skip_unsupported_fields); + +} +#endif diff --git a/contrib/clickhouse/src/Formats/ProtobufWriter.cpp b/contrib/clickhouse/src/Formats/ProtobufWriter.cpp new file mode 100644 index 0000000000..da680fae60 --- /dev/null +++ b/contrib/clickhouse/src/Formats/ProtobufWriter.cpp @@ -0,0 +1,248 @@ +#include "ProtobufWriter.h" + +#if USE_PROTOBUF +# include <IO/WriteHelpers.h> + + +namespace DB +{ +namespace +{ + constexpr size_t MAX_VARINT_SIZE = 10; + constexpr size_t REPEATED_PACK_PADDING = 2 * MAX_VARINT_SIZE; + constexpr size_t NESTED_MESSAGE_PADDING = 2 * MAX_VARINT_SIZE; + + // Note: There is a difference between this function and writeVarUInt() from IO/VarInt.h: + // Google protobuf's representation of 64-bit 
integer contains from 1 to 10 bytes, + // whileas writeVarUInt() writes from 1 to 9 bytes because it omits the tenth byte (which is not necessary to decode actually). + void writeVarint(UInt64 value, WriteBuffer & out) + { + while (value >= 0x80) + { + out.write(static_cast<char>(value | 0x80)); + value >>= 7; + } + out.write(static_cast<char>(value)); + } + + UInt8 * writeVarint(UInt64 value, UInt8 * ptr) + { + while (value >= 0x80) + { + *ptr++ = static_cast<UInt8>(value | 0x80); + value >>= 7; + } + *ptr++ = static_cast<UInt8>(value); + return ptr; + } + + void writeVarint(UInt64 value, PODArray<UInt8> & buf) + { + size_t old_size = buf.size(); + buf.reserve(old_size + MAX_VARINT_SIZE); + UInt8 * ptr = buf.data() + old_size; + ptr = writeVarint(value, ptr); + buf.resize_assume_reserved(ptr - buf.data()); + } + + UInt64 encodeZigZag(Int64 value) { return (static_cast<UInt64>(value) << 1) ^ static_cast<UInt64>(value >> 63); } + + enum WireType + { + VARINT = 0, + BITS64 = 1, + LENGTH_DELIMITED = 2, + GROUP_START = 3, + GROUP_END = 4, + BITS32 = 5 + }; + + UInt8 * writeFieldNumber(UInt32 field_number, WireType wire_type, UInt8 * ptr) + { + return writeVarint((field_number << 3) | wire_type, ptr); + } + + void writeFieldNumber(UInt32 field_number, WireType wire_type, PODArray<UInt8> & buf) { writeVarint((field_number << 3) | wire_type, buf); } +} + + +ProtobufWriter::ProtobufWriter(WriteBuffer & out_) + : out(out_) +{ +} + +ProtobufWriter::~ProtobufWriter() = default; + +void ProtobufWriter::startMessage() +{ +} + +void ProtobufWriter::endMessage(bool with_length_delimiter) +{ + pieces.emplace_back(current_piece_start, buffer.size()); + if (with_length_delimiter) + { + size_t size_of_message = buffer.size() - num_bytes_skipped; + writeVarint(size_of_message, out); + } + for (const auto & piece : pieces) + if (piece.end > piece.start) + out.write(reinterpret_cast<char *>(&buffer[piece.start]), piece.end - piece.start); + buffer.clear(); + pieces.clear(); + 
num_bytes_skipped = 0; + current_piece_start = 0; +} + +void ProtobufWriter::startNestedMessage() +{ + nested_infos.emplace_back(pieces.size(), num_bytes_skipped); + pieces.emplace_back(current_piece_start, buffer.size()); + + // We skip enough bytes to have place for inserting the field number and the size of the nested message afterwards + // when we finish writing the nested message itself. We don't know the size of the nested message at the point of + // calling startNestedMessage(), that's why we have to do this skipping. + current_piece_start = buffer.size() + NESTED_MESSAGE_PADDING; + buffer.resize(current_piece_start); + num_bytes_skipped = NESTED_MESSAGE_PADDING; +} + +void ProtobufWriter::endNestedMessage(int field_number, bool is_group, bool skip_if_empty) +{ + const auto & nested_info = nested_infos.back(); + size_t num_pieces_at_start = nested_info.num_pieces_at_start; + size_t num_bytes_skipped_at_start = nested_info.num_bytes_skipped_at_start; + nested_infos.pop_back(); + auto & piece_before_message = pieces[num_pieces_at_start]; + size_t message_start = piece_before_message.end; + size_t message_size = buffer.size() - message_start - num_bytes_skipped; + if (!message_size && skip_if_empty) + { + current_piece_start = piece_before_message.start; + buffer.resize(piece_before_message.end); + pieces.resize(num_pieces_at_start); + num_bytes_skipped = num_bytes_skipped_at_start; + return; + } + size_t num_bytes_inserted; + if (is_group) + { + writeFieldNumber(field_number, GROUP_END, buffer); + UInt8 * ptr = &buffer[piece_before_message.end]; + UInt8 * endptr = writeFieldNumber(field_number, GROUP_START, ptr); + num_bytes_inserted = endptr - ptr; + } + else + { + UInt8 * ptr = &buffer[piece_before_message.end]; + UInt8 * endptr = writeFieldNumber(field_number, LENGTH_DELIMITED, ptr); + endptr = writeVarint(message_size, endptr); + num_bytes_inserted = endptr - ptr; + } + piece_before_message.end += num_bytes_inserted; + num_bytes_skipped += 
num_bytes_skipped_at_start - num_bytes_inserted; +} + +void ProtobufWriter::writeUInt(int field_number, UInt64 value) +{ + if (in_repeated_pack) + { + writeVarint(value, buffer); + return; + } + size_t old_size = buffer.size(); + buffer.reserve(old_size + 2 * MAX_VARINT_SIZE); + UInt8 * ptr = buffer.data() + old_size; + ptr = writeFieldNumber(field_number, VARINT, ptr); + ptr = writeVarint(value, ptr); + buffer.resize_assume_reserved(ptr - buffer.data()); +} + +void ProtobufWriter::writeInt(int field_number, Int64 value) +{ + writeUInt(field_number, static_cast<UInt64>(value)); +} + +void ProtobufWriter::writeSInt(int field_number, Int64 value) +{ + writeUInt(field_number, encodeZigZag(value)); +} + +template <typename T> +void ProtobufWriter::writeFixed(int field_number, T value) +{ + static_assert((sizeof(T) == 4) || (sizeof(T) == 8)); + if (in_repeated_pack) + { + size_t old_size = buffer.size(); + buffer.resize(old_size + sizeof(T)); + memcpy(buffer.data() + old_size, &value, sizeof(T)); + return; + } + constexpr WireType wire_type = (sizeof(T) == 4) ? 
BITS32 : BITS64; + size_t old_size = buffer.size(); + buffer.reserve(old_size + MAX_VARINT_SIZE + sizeof(T)); + UInt8 * ptr = buffer.data() + old_size; + ptr = writeFieldNumber(field_number, wire_type, ptr); + memcpy(ptr, &value, sizeof(T)); + ptr += sizeof(T); + buffer.resize_assume_reserved(ptr - buffer.data()); +} + +template void ProtobufWriter::writeFixed<Int32>(int field_number, Int32 value); +template void ProtobufWriter::writeFixed<UInt32>(int field_number, UInt32 value); +template void ProtobufWriter::writeFixed<Int64>(int field_number, Int64 value); +template void ProtobufWriter::writeFixed<UInt64>(int field_number, UInt64 value); +template void ProtobufWriter::writeFixed<Float32>(int field_number, Float32 value); +template void ProtobufWriter::writeFixed<Float64>(int field_number, Float64 value); + +void ProtobufWriter::writeString(int field_number, std::string_view str) +{ + size_t length = str.length(); + size_t old_size = buffer.size(); + buffer.reserve(old_size + 2 * MAX_VARINT_SIZE + length); + UInt8 * ptr = buffer.data() + old_size; + ptr = writeFieldNumber(field_number, LENGTH_DELIMITED, ptr); + ptr = writeVarint(length, ptr); + memcpy(ptr, str.data(), length); + ptr += length; + buffer.resize_assume_reserved(ptr - buffer.data()); +} + +void ProtobufWriter::startRepeatedPack() +{ + pieces.emplace_back(current_piece_start, buffer.size()); + + // We skip enough bytes to have place for inserting the field number and the size of the repeated pack afterwards + // when we finish writing the repeated pack itself. We don't know the size of the repeated pack at the point of + // calling startRepeatedPack(), that's why we have to do this skipping. 
+ current_piece_start = buffer.size() + REPEATED_PACK_PADDING; + buffer.resize(current_piece_start); + num_bytes_skipped += REPEATED_PACK_PADDING; + in_repeated_pack = true; +} + +void ProtobufWriter::endRepeatedPack(int field_number, bool skip_if_empty) +{ + size_t size = buffer.size() - current_piece_start; + if (!size && skip_if_empty) + { + current_piece_start = pieces.back().start; + buffer.resize(pieces.back().end); + pieces.pop_back(); + num_bytes_skipped -= REPEATED_PACK_PADDING; + in_repeated_pack = false; + return; + } + UInt8 * ptr = &buffer[pieces.back().end]; + UInt8 * endptr = writeFieldNumber(field_number, LENGTH_DELIMITED, ptr); + endptr = writeVarint(size, endptr); + size_t num_bytes_inserted = endptr - ptr; + pieces.back().end += num_bytes_inserted; + num_bytes_skipped -= num_bytes_inserted; + in_repeated_pack = false; +} + +} + +#endif diff --git a/contrib/clickhouse/src/Formats/ProtobufWriter.h b/contrib/clickhouse/src/Formats/ProtobufWriter.h new file mode 100644 index 0000000000..4fd10ba09b --- /dev/null +++ b/contrib/clickhouse/src/Formats/ProtobufWriter.h @@ -0,0 +1,67 @@ +#pragma once + +#include "clickhouse_config.h" + +#if USE_PROTOBUF +# include <Core/Types.h> +# include <Common/PODArray.h> + + +namespace DB +{ +class WriteBuffer; + +/// Utility class for writing in the Protobuf format. +/// Knows nothing about protobuf schemas, just provides useful functions to serialize data. 
+class ProtobufWriter +{ +public: + explicit ProtobufWriter(WriteBuffer & out_); + ~ProtobufWriter(); + + void startMessage(); + void endMessage(bool with_length_delimiter); + + void startNestedMessage(); + void endNestedMessage(int field_number, bool is_group, bool skip_if_empty); + + void writeInt(int field_number, Int64 value); + void writeUInt(int field_number, UInt64 value); + void writeSInt(int field_number, Int64 value); + template <typename T> + void writeFixed(int field_number, T value); + void writeString(int field_number, std::string_view str); + + void startRepeatedPack(); + void endRepeatedPack(int field_number, bool skip_if_empty); + +private: + struct Piece + { + size_t start; + size_t end; + Piece(size_t start_, size_t end_) : start(start_), end(end_) {} + Piece() = default; + }; + + struct NestedInfo + { + size_t num_pieces_at_start; + size_t num_bytes_skipped_at_start; + NestedInfo(size_t num_pieces_at_start_, size_t num_bytes_skipped_at_start_) + : num_pieces_at_start(num_pieces_at_start_), num_bytes_skipped_at_start(num_bytes_skipped_at_start_) + { + } + }; + + WriteBuffer & out; + PODArray<UInt8> buffer; + std::vector<Piece> pieces; + size_t current_piece_start = 0; + size_t num_bytes_skipped = 0; + std::vector<NestedInfo> nested_infos; + bool in_repeated_pack = false; +}; + +} +#endif diff --git a/contrib/clickhouse/src/Formats/ReadSchemaUtils.cpp b/contrib/clickhouse/src/Formats/ReadSchemaUtils.cpp new file mode 100644 index 0000000000..bd75dd13f1 --- /dev/null +++ b/contrib/clickhouse/src/Formats/ReadSchemaUtils.cpp @@ -0,0 +1,272 @@ +#include <DataTypes/DataTypeMap.h> +#include <Formats/ReadSchemaUtils.h> +#include <Interpreters/Context.h> +#include <Processors/Formats/ISchemaReader.h> +#include <Storages/IStorage.h> +#include <Common/assert_cast.h> +#include <IO/WithFileName.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int EMPTY_DATA_PASSED; + extern const int BAD_ARGUMENTS; + extern const int 
ONLY_NULLS_WHILE_READING_SCHEMA; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; +} + +static std::optional<NamesAndTypesList> getOrderedColumnsList(const NamesAndTypesList & columns_list, const Names & columns_order_hint) +{ + if (columns_list.size() != columns_order_hint.size()) + return {}; + + std::unordered_map<String, DataTypePtr> available_columns; + for (const auto & [name, type] : columns_list) + available_columns.emplace(name, type); + + NamesAndTypesList res; + for (const auto & name : columns_order_hint) + { + auto it = available_columns.find(name); + if (it == available_columns.end()) + return {}; + + res.emplace_back(name, it->second); + } + return res; +} + +bool isRetryableSchemaInferenceError(int code) +{ + return code == ErrorCodes::EMPTY_DATA_PASSED || code == ErrorCodes::ONLY_NULLS_WHILE_READING_SCHEMA; +} + +ColumnsDescription readSchemaFromFormat( + const String & format_name, + const std::optional<FormatSettings> & format_settings, + IReadBufferIterator & read_buffer_iterator, + bool retry, + ContextPtr & context, + std::unique_ptr<ReadBuffer> & buf) +try +{ + NamesAndTypesList names_and_types; + if (FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name)) + { + auto external_schema_reader = FormatFactory::instance().getExternalSchemaReader(format_name, context, format_settings); + try + { + names_and_types = external_schema_reader->readSchema(); + } + catch (Exception & e) + { + e.addMessage( + fmt::format("Cannot extract table structure from {} format file. You can specify the structure manually", format_name)); + throw; + } + } + else if (FormatFactory::instance().checkIfFormatHasSchemaReader(format_name)) + { + std::string exception_messages; + SchemaReaderPtr schema_reader; + size_t max_rows_to_read = format_settings ? format_settings->max_rows_to_read_for_schema_inference + : context->getSettingsRef().input_format_max_rows_to_read_for_schema_inference; + size_t max_bytes_to_read = format_settings ? 
format_settings->max_bytes_to_read_for_schema_inference + : context->getSettingsRef().input_format_max_bytes_to_read_for_schema_inference; + size_t iterations = 0; + while (true) + { + bool is_eof = false; + try + { + buf = read_buffer_iterator.next(); + if (!buf) + break; + is_eof = buf->eof(); + } + catch (Exception & e) + { + e.addMessage( + fmt::format("Cannot extract table structure from {} format file. You can specify the structure manually", format_name)); + throw; + } + catch (...) + { + auto exception_message = getCurrentExceptionMessage(false); + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file:\n{}\nYou can specify the structure manually", + format_name, + exception_message); + } + + ++iterations; + + if (is_eof) + { + auto exception_message = fmt::format("Cannot extract table structure from {} format file, file is empty", format_name); + + if (!retry) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "{}. You can specify the structure manually", exception_message); + + exception_messages += "\n" + exception_message; + continue; + } + + try + { + schema_reader = FormatFactory::instance().getSchemaReader(format_name, *buf, context, format_settings); + schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); + names_and_types = schema_reader->readSchema(); + auto num_rows = schema_reader->readNumberOrRows(); + if (num_rows) + read_buffer_iterator.setNumRowsToLastFile(*num_rows); + break; + } + catch (...) + { + auto exception_message = getCurrentExceptionMessage(false); + if (schema_reader) + { + size_t rows_read = schema_reader->getNumRowsRead(); + assert(rows_read <= max_rows_to_read); + max_rows_to_read -= schema_reader->getNumRowsRead(); + size_t bytes_read = buf->count(); + /// We could exceed max_bytes_to_read a bit to complete row parsing. 
+ max_bytes_to_read -= std::min(bytes_read, max_bytes_to_read); + if (rows_read != 0 && (max_rows_to_read == 0 || max_bytes_to_read == 0)) + { + exception_message += "\nTo increase the maximum number of rows/bytes to read for structure determination, use setting " + "input_format_max_rows_to_read_for_schema_inference/input_format_max_bytes_to_read_for_schema_inference"; + + if (iterations > 1) + { + exception_messages += "\n" + exception_message; + break; + } + retry = false; + } + } + + if (!retry || !isRetryableSchemaInferenceError(getCurrentExceptionCode())) + { + try + { + throw; + } + catch (Exception & e) + { + e.addMessage(fmt::format( + "Cannot extract table structure from {} format file. You can specify the structure manually", format_name)); + throw; + } + catch (...) + { + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file. " + "Error: {}. You can specify the structure manually", + format_name, + exception_message); + } + } + + exception_messages += "\n" + exception_message; + } + } + + if (auto cached_columns = read_buffer_iterator.getCachedColumns()) + return *cached_columns; + + if (names_and_types.empty()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "All attempts to extract table structure from files failed. " + "Errors:{}\nYou can specify the structure manually", + exception_messages); + + /// If we have "INSERT SELECT" query then try to order + /// columns as they are ordered in table schema for formats + /// without strict column order (like JSON and TSKV). + /// It will allow to execute simple data loading with query + /// "INSERT INTO table SELECT * FROM ..." 
+ const auto & insertion_table = context->getInsertionTable(); + if (!schema_reader->hasStrictOrderOfColumns() && !insertion_table.empty()) + { + auto storage = DatabaseCatalog::instance().getTable(insertion_table, context); + auto metadata = storage->getInMemoryMetadataPtr(); + auto names_in_storage = metadata->getColumns().getNamesOfPhysical(); + auto ordered_list = getOrderedColumnsList(names_and_types, names_in_storage); + if (ordered_list) + names_and_types = *ordered_list; + } + } + else + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "{} file format doesn't support schema inference. You must specify the structure manually", + format_name); + + /// Some formats like CSVWithNames can contain empty column names. We don't support empty column names and further processing can fail with an exception. Let's just remove columns with empty names from the structure. + names_and_types.erase( + std::remove_if(names_and_types.begin(), names_and_types.end(), [](const NameAndTypePair & pair) { return pair.name.empty(); }), + names_and_types.end()); + return ColumnsDescription(names_and_types); +} +catch (Exception & e) +{ + if (!buf) + throw; + auto file_name = getFileNameFromReadBuffer(*buf); + if (!file_name.empty()) + e.addMessage(fmt::format("(in file/uri {})", file_name)); + throw; +} + + +ColumnsDescription readSchemaFromFormat( + const String & format_name, + const std::optional<FormatSettings> & format_settings, + IReadBufferIterator & read_buffer_iterator, + bool retry, + ContextPtr & context) +{ + std::unique_ptr<ReadBuffer> buf_out; + return readSchemaFromFormat(format_name, format_settings, read_buffer_iterator, retry, context, buf_out); +} + +SchemaCache::Key getKeyForSchemaCache( + const String & source, const String & format, const std::optional<FormatSettings> & format_settings, const ContextPtr & context) +{ + return getKeysForSchemaCache({source}, format, format_settings, context).front(); +} + +static SchemaCache::Key makeSchemaCacheKey(const String & 
source, const String & format, const String & additional_format_info) +{ + return SchemaCache::Key{source, format, additional_format_info}; +} + +SchemaCache::Keys getKeysForSchemaCache( + const Strings & sources, const String & format, const std::optional<FormatSettings> & format_settings, const ContextPtr & context) +{ + /// For some formats data schema depends on some settings, so it's possible that + /// two queries to the same source will get two different schemas. To process this + /// case we add some additional information specific for the format to the cache key. + /// For example, for Protobuf format additional information is the path to the schema + /// and message name. + String additional_format_info = FormatFactory::instance().getAdditionalInfoForSchemaCache(format, context, format_settings); + SchemaCache::Keys cache_keys; + cache_keys.reserve(sources.size()); + std::transform( + sources.begin(), + sources.end(), + std::back_inserter(cache_keys), + [&](const auto & source) { return makeSchemaCacheKey(source, format, additional_format_info); }); + return cache_keys; +} + +} diff --git a/contrib/clickhouse/src/Formats/ReadSchemaUtils.h b/contrib/clickhouse/src/Formats/ReadSchemaUtils.h new file mode 100644 index 0000000000..e6a1944e06 --- /dev/null +++ b/contrib/clickhouse/src/Formats/ReadSchemaUtils.h @@ -0,0 +1,70 @@ +#pragma once + +#include <Formats/FormatFactory.h> +#include <Storages/Cache/SchemaCache.h> +#include <Storages/ColumnsDescription.h> + +namespace DB +{ + +struct IReadBufferIterator +{ + virtual ~IReadBufferIterator() = default; + + virtual std::unique_ptr<ReadBuffer> next() = 0; + + virtual std::optional<ColumnsDescription> getCachedColumns() { return std::nullopt; } + + virtual void setNumRowsToLastFile(size_t /*num_rows*/) {} +}; + +struct SingleReadBufferIterator : public IReadBufferIterator +{ +public: + SingleReadBufferIterator(std::unique_ptr<ReadBuffer> buf_) : buf(std::move(buf_)) + { + } + + std::unique_ptr<ReadBuffer> next() 
override + { + if (done) + return nullptr; + done = true; + return std::move(buf); + } + +private: + std::unique_ptr<ReadBuffer> buf; + bool done = false; +}; + +/// Try to determine the schema of the data and number of rows in data in the specified format. +/// For formats that have an external schema reader, it will +/// use it and won't create a read buffer. +/// For formats that have a schema reader from the data, +/// read buffer will be created by the provided iterator and +/// the schema will be extracted from the data. If schema reader +/// couldn't determine the schema we will try the next read buffer +/// from the provided iterator if it makes sense. If the format doesn't +/// have any schema reader or we couldn't determine the schema, +/// an exception will be thrown. +ColumnsDescription readSchemaFromFormat( + const String & format_name, + const std::optional<FormatSettings> & format_settings, + IReadBufferIterator & read_buffer_iterator, + bool retry, + ContextPtr & context); + +/// If ReadBuffer is created, it will be written to buf_out. 
+ColumnsDescription readSchemaFromFormat( + const String & format_name, + const std::optional<FormatSettings> & format_settings, + IReadBufferIterator & read_buffer_iterator, + bool retry, + ContextPtr & context, + std::unique_ptr<ReadBuffer> & buf_out); + +SchemaCache::Key getKeyForSchemaCache(const String & source, const String & format, const std::optional<FormatSettings> & format_settings, const ContextPtr & context); +SchemaCache::Keys getKeysForSchemaCache(const Strings & sources, const String & format, const std::optional<FormatSettings> & format_settings, const ContextPtr & context); + +} diff --git a/contrib/clickhouse/src/Formats/RowInputMissingColumnsFiller.cpp b/contrib/clickhouse/src/Formats/RowInputMissingColumnsFiller.cpp new file mode 100644 index 0000000000..e1b2b360ab --- /dev/null +++ b/contrib/clickhouse/src/Formats/RowInputMissingColumnsFiller.cpp @@ -0,0 +1,144 @@ +#include <Formats/RowInputMissingColumnsFiller.h> +#include <Columns/ColumnArray.h> +#include <DataTypes/NestedUtils.h> +#include <boost/range/adaptor/map.hpp> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; +} + + +RowInputMissingColumnsFiller::RowInputMissingColumnsFiller() = default; + +RowInputMissingColumnsFiller::RowInputMissingColumnsFiller(const NamesAndTypesList & names_and_types) +{ + std::unordered_map<std::string_view, std::vector<size_t>> nested_groups; /// Nested prefix -> column indices. + size_t i = 0; + for (auto it = names_and_types.begin(); it != names_and_types.end(); ++it, ++i) + { + const auto & name_and_type = *it; + if (isArray(name_and_type.type)) + { + auto split = Nested::splitName(name_and_type.name); + if (!split.second.empty()) /// Is it really a column of Nested data structure? 
+ nested_groups[split.first].push_back(i); + } + } + setNestedGroups(std::move(nested_groups), names_and_types.size()); +} + +RowInputMissingColumnsFiller::RowInputMissingColumnsFiller(const Names & names, const DataTypes & types) +{ + std::unordered_map<std::string_view, std::vector<size_t>> nested_groups; /// Nested prefix -> column indices. + for (size_t i = 0; i != names.size(); ++i) + { + if (isArray(types[i])) + { + auto split = Nested::splitName(names[i]); + if (!split.second.empty()) /// Is it really a column of Nested data structure? + nested_groups[split.first].push_back(i); + } + } + setNestedGroups(std::move(nested_groups), names.size()); +} + +RowInputMissingColumnsFiller::RowInputMissingColumnsFiller(size_t count, const std::string_view * names, const DataTypePtr * types) +{ + std::unordered_map<std::string_view, std::vector<size_t>> nested_groups; /// Nested prefix -> column indices. + for (size_t i = 0; i != count; ++i) + { + if (isArray(types[i])) + { + auto split = Nested::splitName(names[i]); + if (!split.second.empty()) /// Is it really a column of Nested data structure? 
+ nested_groups[split.first].push_back(i); + } + } + setNestedGroups(std::move(nested_groups), count); +} + +void RowInputMissingColumnsFiller::setNestedGroups(std::unordered_map<std::string_view, std::vector<size_t>> && nested_groups, size_t num_columns) +{ + if (!nested_groups.empty()) + { + column_infos.resize(num_columns); + for (auto & nested_group : nested_groups | boost::adaptors::map_values) + { + if (nested_group.size() <= 1) + continue; + auto nested_group_shared = std::make_shared<std::vector<size_t>>(std::move(nested_group)); + for (size_t i : *nested_group_shared) + column_infos[i].nested_group = nested_group_shared; + } + } +} + + +void RowInputMissingColumnsFiller::addDefaults(MutableColumns & columns, size_t row_num) const +{ + for (size_t i = 0; i != columns.size(); ++i) + { + auto & column = *columns[i]; + size_t column_size = column.size(); + if (row_num < column_size) + continue; /// The column already has an element in this position, skipping. + + if (row_num > column_size) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Wrong row_number {}, expected either {} or {}", row_num, column_size - 1, column_size); + + if ((i >= column_infos.size()) || !column_infos[i].nested_group) + { + column.insertDefault(); + continue; + } + + const auto & nested_group = *column_infos[i].nested_group; + size_t size_of_array = 0; + for (size_t j : nested_group) + { + const auto & column_j = columns[j]; + size_t column_size_j = column_j->size(); + if (row_num < column_size_j) + { + const auto * column_array = typeid_cast<const ColumnArray *>(column_j.get()); + if (!column_array) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Column with Array type is not represented by ColumnArray column: {}", + column_j->dumpStructure()); + const auto & offsets = column_array->getOffsets(); + size_of_array = offsets[row_num] - offsets[row_num - 1]; + break; + } + } + + for (size_t j : nested_group) + { + auto & column_j = columns[j]; + size_t column_size_j = column_j->size(); + if 
(row_num >= column_size_j) + { + if (row_num > column_size_j) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Wrong row_number {}, expected either {} or {}", row_num, column_size_j - 1, column_size_j); + + auto * column_array = typeid_cast<ColumnArray *>(column_j.get()); + if (!column_array) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Column with Array type is not represented by ColumnArray column: {}", + column_j->dumpStructure()); + + auto & data = column_array->getData(); + auto & offsets = column_array->getOffsets(); + for (size_t k = 0; k != size_of_array; ++k) + data.insertDefault(); + offsets.push_back(data.size()); + } + } + } +} + +} diff --git a/contrib/clickhouse/src/Formats/RowInputMissingColumnsFiller.h b/contrib/clickhouse/src/Formats/RowInputMissingColumnsFiller.h new file mode 100644 index 0000000000..9785d8bed6 --- /dev/null +++ b/contrib/clickhouse/src/Formats/RowInputMissingColumnsFiller.h @@ -0,0 +1,40 @@ +#pragma once + +#include <Core/NamesAndTypes.h> + + +namespace DB +{ + +/// Adds default values to columns if they don't have a specified row yet. +/// This class can be useful for implementing IRowInputFormat. +/// For missing columns of nested structure, it creates not columns of empty arrays, +/// but columns of arrays of correct lengths. +class RowInputMissingColumnsFiller +{ +public: + /// Makes a column filler which checks nested structures while adding default values to columns. + explicit RowInputMissingColumnsFiller(const NamesAndTypesList & names_and_types); + RowInputMissingColumnsFiller(const Names & names, const DataTypes & types); + RowInputMissingColumnsFiller(size_t count, const std::string_view * names, const DataTypePtr * types); + + /// Default constructor makes a column filler which doesn't check nested structures while + /// adding default values to columns. + RowInputMissingColumnsFiller(); + + /// Adds default values to some columns. 
+ /// For each column the function checks the number of rows and if it's less than (row_num + 1) + /// the function will add a default value to this column. + void addDefaults(MutableColumns & columns, size_t row_num) const; + +private: + void setNestedGroups(std::unordered_map<std::string_view, std::vector<size_t>> && nested_groups, size_t num_columns); + + struct ColumnInfo + { + std::shared_ptr<std::vector<size_t>> nested_group; + }; + std::vector<ColumnInfo> column_infos; +}; + +} diff --git a/contrib/clickhouse/src/Formats/SchemaInferenceUtils.cpp b/contrib/clickhouse/src/Formats/SchemaInferenceUtils.cpp new file mode 100644 index 0000000000..011860948c --- /dev/null +++ b/contrib/clickhouse/src/Formats/SchemaInferenceUtils.cpp @@ -0,0 +1,1155 @@ +#include <Formats/SchemaInferenceUtils.h> +#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/DataTypesNumber.h> +#include <DataTypes/DataTypeString.h> +#include <DataTypes/DataTypeDateTime64.h> +#include <DataTypes/DataTypeDateTime.h> +#include <DataTypes/DataTypeDate.h> +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/DataTypeMap.h> +#include <DataTypes/DataTypeLowCardinality.h> +#include <DataTypes/DataTypeNothing.h> +#include <DataTypes/transformTypesRecursively.h> +#include <DataTypes/DataTypeObject.h> +#include <DataTypes/DataTypeFactory.h> +#include <IO/ReadBufferFromString.h> +#include <IO/ReadHelpers.h> +#include <IO/parseDateTimeBestEffort.h> +#include <IO/PeekableReadBuffer.h> + +#include <Core/Block.h> +#include <Common/assert_cast.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int TOO_DEEP_RECURSION; +} + +namespace +{ + bool checkIfTypesAreEqual(const DataTypes & types) + { + if (types.empty()) + return true; + + for (size_t i = 1; i < types.size(); ++i) + { + if (!types[0]->equals(*types[i])) + return false; + } + return true; + } + + void updateTypeIndexes(DataTypes & data_types, TypeIndexesSet & type_indexes) + { + 
type_indexes.clear(); + for (const auto & type : data_types) + type_indexes.insert(type->getTypeId()); + } + + /// If we have both Nothing and non Nothing types, convert all Nothing types to the first non Nothing. + /// For example if we have types [Nothing, String, Nothing] we change it to [String, String, String] + void transformNothingSimpleTypes(DataTypes & data_types, TypeIndexesSet & type_indexes) + { + /// Check if we have both Nothing and non Nothing types. + if (!type_indexes.contains(TypeIndex::Nothing) || type_indexes.size() <= 1) + return; + + DataTypePtr not_nothing_type = nullptr; + for (const auto & type : data_types) + { + if (!isNothing(type)) + { + not_nothing_type = type; + break; + } + } + + for (auto & type : data_types) + { + if (isNothing(type)) + type = not_nothing_type; + } + + type_indexes.erase(TypeIndex::Nothing); + } + + /// If we have both Int64 and UInt64, convert all Int64 to UInt64, + /// because UInt64 is inferred only in case of Int64 overflow. + void transformIntegers(DataTypes & data_types, TypeIndexesSet & type_indexes) + { + if (!type_indexes.contains(TypeIndex::Int64) || !type_indexes.contains(TypeIndex::UInt64)) + return; + + for (auto & type : data_types) + { + if (WhichDataType(type).isInt64()) + type = std::make_shared<DataTypeUInt64>(); + } + + type_indexes.erase(TypeIndex::Int64); + } + + /// If we have both Int64 and Float64 types, convert all Int64 to Float64. 
+ void transformIntegersAndFloatsToFloats(DataTypes & data_types, TypeIndexesSet & type_indexes) + { + bool have_floats = type_indexes.contains(TypeIndex::Float64); + bool have_integers = type_indexes.contains(TypeIndex::Int64) || type_indexes.contains(TypeIndex::UInt64); + if (!have_integers || !have_floats) + return; + + for (auto & type : data_types) + { + WhichDataType which(type); + if (which.isInt64() || which.isUInt64()) + type = std::make_shared<DataTypeFloat64>(); + } + + type_indexes.erase(TypeIndex::Int64); + type_indexes.erase(TypeIndex::UInt64); + } + + /// If we have only Date and DateTime types, convert Date to DateTime, + /// otherwise, convert all Date and DateTime to String. + void transformDatesAndDateTimes(DataTypes & data_types, TypeIndexesSet & type_indexes) + { + bool have_dates = type_indexes.contains(TypeIndex::Date); + bool have_datetimes = type_indexes.contains(TypeIndex::DateTime64); + bool all_dates_or_datetimes = (type_indexes.size() == (static_cast<size_t>(have_dates) + static_cast<size_t>(have_datetimes))); + + if (!all_dates_or_datetimes && (have_dates || have_datetimes)) + { + for (auto & type : data_types) + { + if (isDate(type) || isDateTime64(type)) + type = std::make_shared<DataTypeString>(); + } + + type_indexes.erase(TypeIndex::Date); + type_indexes.erase(TypeIndex::DateTime); + type_indexes.insert(TypeIndex::String); + return; + } + + if (have_dates && have_datetimes) + { + for (auto & type : data_types) + { + if (isDate(type)) + type = std::make_shared<DataTypeDateTime64>(9); + } + + type_indexes.erase(TypeIndex::Date); + } + } + + /// If we have numbers (Int64/UInt64/Float64) and String types and numbers were parsed from String, + /// convert all numbers to String. 
+ void transformJSONNumbersBackToString( + DataTypes & data_types, const FormatSettings & settings, TypeIndexesSet & type_indexes, JSONInferenceInfo * json_info) + { + bool have_strings = type_indexes.contains(TypeIndex::String); + bool have_numbers = type_indexes.contains(TypeIndex::Int64) || type_indexes.contains(TypeIndex::UInt64) || type_indexes.contains(TypeIndex::Float64); + if (!have_strings || !have_numbers) + return; + + for (auto & type : data_types) + { + if (isNumber(type) + && (settings.json.read_numbers_as_strings || !json_info + || json_info->numbers_parsed_from_json_strings.contains(type.get()))) + type = std::make_shared<DataTypeString>(); + } + + updateTypeIndexes(data_types, type_indexes); + } + + /// If we have both Bool and number (Int64/UInt64/Float64) types, + /// convert all Bool to Int64/UInt64/Float64. + void transformBoolsAndNumbersToNumbers(DataTypes & data_types, TypeIndexesSet & type_indexes) + { + bool have_floats = type_indexes.contains(TypeIndex::Float64); + bool have_signed_integers = type_indexes.contains(TypeIndex::Int64); + bool have_unsigned_integers = type_indexes.contains(TypeIndex::UInt64); + bool have_bools = type_indexes.contains(TypeIndex::UInt8); + /// Check if we have both Bool and Integer/Float. + if (!have_bools || (!have_signed_integers && !have_unsigned_integers && !have_floats)) + return; + + for (auto & type : data_types) + { + if (isBool(type)) + { + if (have_signed_integers) + type = std::make_shared<DataTypeInt64>(); + else if (have_unsigned_integers) + type = std::make_shared<DataTypeUInt64>(); + else + type = std::make_shared<DataTypeFloat64>(); + } + } + + type_indexes.erase(TypeIndex::UInt8); + } + + /// If we have type Nothing/Nullable(Nothing) and some other non Nothing types, + /// convert all Nothing/Nullable(Nothing) types to the first non Nothing. 
+ /// For example, when we have [Nothing, Array(Int64)] it will convert it to [Array(Int64), Array(Int64)] + /// (it can happen when transforming complex nested types like [Array(Nothing), Array(Array(Int64))]) + void transformNothingComplexTypes(DataTypes & data_types, TypeIndexesSet & type_indexes) + { + bool have_nothing = false; + DataTypePtr not_nothing_type = nullptr; + for (const auto & type : data_types) + { + if (isNothing(removeNullable(type))) + have_nothing = true; + else + not_nothing_type = type; + } + + if (!have_nothing || !not_nothing_type) + return; + + for (auto & type : data_types) + { + if (isNothing(removeNullable(type))) + type = not_nothing_type; + } + + updateTypeIndexes(data_types, type_indexes); + } + + /// If we have both Nullable and non Nullable types, make all types Nullable + void transformNullableTypes(DataTypes & data_types, TypeIndexesSet & type_indexes) + { + if (!type_indexes.contains(TypeIndex::Nullable)) + return; + + for (auto & type : data_types) + { + if (type->canBeInsideNullable()) + type = makeNullable(type); + } + + updateTypeIndexes(data_types, type_indexes); + } + + /// If we have Tuple with the same nested types like Tuple(Int64, Int64), + /// convert it to Array(Int64). It's used for JSON values. 
+ /// For example when we had type Tuple(Int64, Nullable(Nothing)) and we + /// transformed it to Tuple(Nullable(Int64), Nullable(Int64)) we will + /// also transform it to Array(Nullable(Int64)) + void transformTuplesWithEqualNestedTypesToArrays(DataTypes & data_types, TypeIndexesSet & type_indexes) + { + if (!type_indexes.contains(TypeIndex::Tuple)) + return; + + bool remove_tuple_index = true; + for (auto & type : data_types) + { + if (isTuple(type)) + { + const auto * tuple_type = assert_cast<const DataTypeTuple *>(type.get()); + if (checkIfTypesAreEqual(tuple_type->getElements())) + type = std::make_shared<DataTypeArray>(tuple_type->getElements().back()); + else + remove_tuple_index = false; + } + } + + if (remove_tuple_index) + type_indexes.erase(TypeIndex::Tuple); + } + + template <bool is_json> + void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings & settings, JSONInferenceInfo * json_info = nullptr); + + /// If we have Tuple and Array types, try to convert them all to Array + /// if there is a common type for all nested types. + /// For example, if we have [Tuple(Nullable(Nothing), String), Array(Date), Tuple(Date, String)] + /// it will convert them all to Array(String) + void transformJSONTuplesAndArraysToArrays( + DataTypes & data_types, const FormatSettings & settings, TypeIndexesSet & type_indexes, JSONInferenceInfo * json_info) + { + if (!type_indexes.contains(TypeIndex::Tuple)) + return; + + bool have_arrays = type_indexes.contains(TypeIndex::Array); + bool tuple_sizes_are_equal = true; + size_t tuple_size = 0; + for (const auto & type : data_types) + { + if (isTuple(type)) + { + const auto & current_tuple_size = assert_cast<const DataTypeTuple &>(*type).getElements().size(); + if (!tuple_size) + tuple_size = current_tuple_size; + else + tuple_sizes_are_equal &= current_tuple_size == tuple_size; + } + } + + /// Check if we have arrays and tuples with same size. 
+ if (!have_arrays && !tuple_sizes_are_equal) + return; + + DataTypes nested_types; + for (auto & type : data_types) + { + if (isArray(type)) + nested_types.push_back(assert_cast<const DataTypeArray &>(*type).getNestedType()); + else if (isTuple(type)) + { + const auto & elements = assert_cast<const DataTypeTuple &>(*type).getElements(); + for (const auto & element : elements) + nested_types.push_back(element); + } + } + + transformInferredTypesIfNeededImpl<true>(nested_types, settings, json_info); + if (checkIfTypesAreEqual(nested_types)) + { + for (auto & type : data_types) + { + if (isArray(type) || isTuple(type)) + type = std::make_shared<DataTypeArray>(nested_types.back()); + } + + type_indexes.erase(TypeIndex::Tuple); + } + } + + /// If we have Map and Object(JSON) types, convert all Map types to Object(JSON). + /// If we have Map types with different value types, convert all Map types to Object(JSON) + void transformMapsAndObjectsToObjects(DataTypes & data_types, TypeIndexesSet & type_indexes) + { + if (!type_indexes.contains(TypeIndex::Map)) + return; + + bool have_objects = type_indexes.contains(TypeIndex::Object); + bool maps_are_equal = true; + DataTypePtr first_map_type = nullptr; + for (const auto & type : data_types) + { + if (isMap(type)) + { + if (!first_map_type) + first_map_type = type; + else + maps_are_equal &= type->equals(*first_map_type); + } + } + + if (!have_objects && maps_are_equal) + return; + + for (auto & type : data_types) + { + if (isMap(type)) + type = std::make_shared<DataTypeObject>("json", true); + } + + type_indexes.erase(TypeIndex::Map); + } + + void transformMapsObjectsAndStringsToStrings(DataTypes & data_types, TypeIndexesSet & type_indexes) + { + bool have_maps = type_indexes.contains(TypeIndex::Map); + bool have_objects = type_indexes.contains(TypeIndex::Object); + bool have_strings = type_indexes.contains(TypeIndex::String); + + /// Check if we have both String and Map/Object + if (!have_strings || (!have_maps && 
!have_objects)) + return; + + for (auto & type : data_types) + { + if (isMap(type) || isObject(type)) + type = std::make_shared<DataTypeString>(); + } + + type_indexes.erase(TypeIndex::Map); + type_indexes.erase(TypeIndex::Object); + } + + template <bool is_json> + void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings & settings, JSONInferenceInfo * json_info) + { + auto transform_simple_types = [&](DataTypes & data_types, TypeIndexesSet & type_indexes) + { + /// Remove all Nothing type if possible. + transformNothingSimpleTypes(data_types, type_indexes); + + if (settings.try_infer_integers) + { + /// Transform Int64 to UInt64 if needed. + transformIntegers(data_types, type_indexes); + /// Transform integers to floats if needed. + transformIntegersAndFloatsToFloats(data_types, type_indexes); + } + + /// Transform Date to DateTime or both to String if needed. + if (settings.try_infer_dates || settings.try_infer_datetimes) + transformDatesAndDateTimes(data_types, type_indexes); + + if constexpr (!is_json) + return; + + /// Check settings specific for JSON formats. + + /// Convert numbers inferred from strings back to strings if needed. + if (settings.json.try_infer_numbers_from_strings || settings.json.read_numbers_as_strings) + transformJSONNumbersBackToString(data_types, settings, type_indexes, json_info); + + /// Convert Bool to number (Int64/Float64) if needed. + if (settings.json.read_bools_as_numbers) + transformBoolsAndNumbersToNumbers(data_types, type_indexes); + }; + + auto transform_complex_types = [&](DataTypes & data_types, TypeIndexesSet & type_indexes) + { + /// Make types Nullable if needed. + transformNullableTypes(data_types, type_indexes); + + /// If we have type Nothing, it means that we had empty Array/Map while inference. + /// If there is at least one non Nothing type, change all Nothing types to it. 
+ transformNothingComplexTypes(data_types, type_indexes); + + if constexpr (!is_json) + return; + + /// Convert JSON tuples with same nested types to arrays. + transformTuplesWithEqualNestedTypesToArrays(data_types, type_indexes); + + /// Convert JSON tuples and arrays to arrays if possible. + transformJSONTuplesAndArraysToArrays(data_types, settings, type_indexes, json_info); + + /// Convert Maps to Objects if needed. + if (settings.json.allow_object_type) + transformMapsAndObjectsToObjects(data_types, type_indexes); + + if (settings.json.read_objects_as_strings) + transformMapsObjectsAndStringsToStrings(data_types, type_indexes); + }; + + transformTypesRecursively(types, transform_simple_types, transform_complex_types); + } + + template <bool is_json> + DataTypePtr tryInferDataTypeForSingleFieldImpl(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info, size_t depth = 1); + + bool tryInferDate(std::string_view field) + { + if (field.empty()) + return false; + + ReadBufferFromString buf(field); + Float64 tmp_float; + /// Check if it's just a number, and if so, don't try to infer Date from it, + /// because we can interpret this number as a Date (for example 20000101 will be 2000-01-01) + /// and it will lead to inferring Date instead of simple Int64/UInt64 in some cases. + if (tryReadFloatText(tmp_float, buf) && buf.eof()) + return false; + + buf.seek(0, SEEK_SET); /// Return position to the beginning + + DayNum tmp; + return tryReadDateText(tmp, buf) && buf.eof(); + } + + bool tryInferDateTime(std::string_view field, const FormatSettings & settings) + { + if (field.empty()) + return false; + + ReadBufferFromString buf(field); + Float64 tmp_float; + /// Check if it's just a number, and if so, don't try to infer DateTime from it, + /// because we can interpret this number as a timestamp and it will lead to + /// inferring DateTime instead of simple Int64/Float64 in some cases. 
+ if (tryReadFloatText(tmp_float, buf) && buf.eof()) + return false; + + buf.seek(0, SEEK_SET); /// Return position to the beginning + DateTime64 tmp; + switch (settings.date_time_input_format) + { + case FormatSettings::DateTimeInputFormat::Basic: + if (tryReadDateTime64Text(tmp, 9, buf) && buf.eof()) + return true; + break; + case FormatSettings::DateTimeInputFormat::BestEffort: + if (tryParseDateTime64BestEffort(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof()) + return true; + break; + case FormatSettings::DateTimeInputFormat::BestEffortUS: + if (tryParseDateTime64BestEffortUS(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof()) + return true; + break; + } + + return false; + } + + template <bool is_json> + DataTypePtr tryInferArray(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info, size_t depth) + { + assertChar('[', buf); + skipWhitespaceIfAny(buf); + + DataTypes nested_types; + bool first = true; + bool have_invalid_nested_type = false; + while (!buf.eof() && *buf.position() != ']') + { + if (!first) + { + /// Skip field delimiter between array elements. + if (!checkChar(',', buf)) + return nullptr; + skipWhitespaceIfAny(buf); + } + else + first = false; + + auto nested_type = tryInferDataTypeForSingleFieldImpl<is_json>(buf, settings, json_info, depth + 2); + + if (nested_type) + nested_types.push_back(nested_type); + else + have_invalid_nested_type = true; + + skipWhitespaceIfAny(buf); + } + + /// No ']' at the end. + if (buf.eof()) + return nullptr; + + assertChar(']', buf); + skipWhitespaceIfAny(buf); + + /// Nested data is invalid. 
+ if (have_invalid_nested_type) + return nullptr; + + /// Empty array has type Array(Nothing) + if (nested_types.empty()) + return std::make_shared<DataTypeArray>(std::make_shared<DataTypeNothing>()); + + if (checkIfTypesAreEqual(nested_types)) + return std::make_shared<DataTypeArray>(std::move(nested_types.back())); + + /// If element types are not equal, we should try to find common type. + /// If after transformation element types are still different, we return Tuple for JSON and + /// nullptr for other formats (nullptr means we couldn't infer the type). + if constexpr (is_json) + { + /// For JSON if we have not complete types, we should not try to transform them + /// and return it as a Tuple. + /// For example, if we have types [Float64, Nullable(Nothing), Float64] + /// it can be Array(Float64) or Tuple(Float64, <some_type>, Float64) and + /// we can't determine which one it is. But we will be able to do it later + /// when we will have types from other rows for this column. + /// For example, if in the next row we will have types [Nullable(Nothing), String, Float64], + /// we can determine the type for this column as Tuple(Nullable(Float64), Nullable(String), Float64). + for (const auto & type : nested_types) + { + if (!checkIfTypeIsComplete(type)) + return std::make_shared<DataTypeTuple>(nested_types); + } + + auto nested_types_copy = nested_types; + transformInferredTypesIfNeededImpl<is_json>(nested_types_copy, settings, json_info); + + if (checkIfTypesAreEqual(nested_types_copy)) + return std::make_shared<DataTypeArray>(nested_types_copy.back()); + + return std::make_shared<DataTypeTuple>(nested_types); + } + else + { + transformInferredTypesIfNeededImpl<is_json>(nested_types, settings); + if (checkIfTypesAreEqual(nested_types)) + return std::make_shared<DataTypeArray>(nested_types.back()); + + /// We couldn't determine common type for array element. 
+ return nullptr; + } + } + + DataTypePtr tryInferTuple(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info, size_t depth) + { + assertChar('(', buf); + skipWhitespaceIfAny(buf); + + DataTypes nested_types; + bool first = true; + bool have_invalid_nested_type = false; + while (!buf.eof() && *buf.position() != ')') + { + if (!first) + { + if (!checkChar(',', buf)) + return nullptr; + skipWhitespaceIfAny(buf); + } + else + first = false; + + auto nested_type = tryInferDataTypeForSingleFieldImpl<false>(buf, settings, json_info, depth + 1); + if (nested_type) + nested_types.push_back(nested_type); + else + have_invalid_nested_type = true; + + skipWhitespaceIfAny(buf); + } + + /// No ')' at the end. + if (buf.eof()) + return nullptr; + + assertChar(')', buf); + skipWhitespaceIfAny(buf); + + /// Nested data is invalid. + if (have_invalid_nested_type || nested_types.empty()) + return nullptr; + + return std::make_shared<DataTypeTuple>(nested_types); + } + + DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings) + { + if (buf.eof()) + return nullptr; + + Float64 tmp_float; + if (settings.try_infer_integers) + { + /// If we read from String, we can do it in a more efficient way. + if (auto * string_buf = dynamic_cast<ReadBufferFromString *>(&buf)) + { + /// Remember the pointer to the start of the number to rollback to it. + char * number_start = buf.position(); + Int64 tmp_int; + bool read_int = tryReadIntText(tmp_int, buf); + /// If we reached eof, it cannot be float (it requires no less data than integer) + if (buf.eof()) + return read_int ? std::make_shared<DataTypeInt64>() : nullptr; + + char * int_end = buf.position(); + /// We can safely get back to the start of the number, because we read from a string and we didn't reach eof. + buf.position() = number_start; + + bool read_uint = false; + char * uint_end = nullptr; + /// In case of Int64 overflow we can try to infer UInt64. 
+ if (!read_int) + { + UInt64 tmp_uint; + read_uint = tryReadIntText(tmp_uint, buf); + /// If we reached eof, it cannot be float (it requires no less data than integer) + if (buf.eof()) + return read_uint ? std::make_shared<DataTypeUInt64>() : nullptr; + + uint_end = buf.position(); + buf.position() = number_start; + } + + if (tryReadFloatText(tmp_float, buf)) + { + if (read_int && buf.position() == int_end) + return std::make_shared<DataTypeInt64>(); + if (read_uint && buf.position() == uint_end) + return std::make_shared<DataTypeUInt64>(); + return std::make_shared<DataTypeFloat64>(); + } + + return nullptr; + } + + /// We should use PeekableReadBuffer, because we need to + /// rollback to the start of number to parse it as integer first + /// and then as float. + PeekableReadBuffer peekable_buf(buf); + PeekableReadBufferCheckpoint checkpoint(peekable_buf); + Int64 tmp_int; + bool read_int = tryReadIntText(tmp_int, peekable_buf); + auto * int_end = peekable_buf.position(); + peekable_buf.rollbackToCheckpoint(true); + + bool read_uint = false; + char * uint_end = nullptr; + /// In case of Int64 overflow we can try to infer UInt64. + if (!read_int) + { + PeekableReadBufferCheckpoint new_checkpoint(peekable_buf); + UInt64 tmp_uint; + read_uint = tryReadIntText(tmp_uint, peekable_buf); + uint_end = peekable_buf.position(); + peekable_buf.rollbackToCheckpoint(true); + } + + if (tryReadFloatText(tmp_float, peekable_buf)) + { + /// Float parsing reads no fewer bytes than integer parsing, + /// so position of the buffer is either the same, or further. + /// If it's the same, then it's integer. + if (read_int && peekable_buf.position() == int_end) + return std::make_shared<DataTypeInt64>(); + if (read_uint && peekable_buf.position() == uint_end) + return std::make_shared<DataTypeUInt64>(); + return std::make_shared<DataTypeFloat64>(); + } + } + else if (tryReadFloatText(tmp_float, buf)) + { + return std::make_shared<DataTypeFloat64>(); + } + + /// This is not a number. 
+ return nullptr; + } + + template <bool is_json> + DataTypePtr tryInferString(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info) + { + String field; + bool ok = true; + if constexpr (is_json) + ok = tryReadJSONStringInto(field, buf); + else + ok = tryReadQuotedStringInto(field, buf); + + if (!ok) + return nullptr; + + skipWhitespaceIfAny(buf); + + /// If it's object key, we should just return String type. + if constexpr (is_json) + { + if (json_info->is_object_key) + return std::make_shared<DataTypeString>(); + } + + if (auto type = tryInferDateOrDateTimeFromString(field, settings)) + return type; + + if constexpr (is_json) + { + if (settings.json.try_infer_numbers_from_strings) + { + if (auto number_type = tryInferNumberFromString(field, settings)) + { + json_info->numbers_parsed_from_json_strings.insert(number_type.get()); + return number_type; + } + } + } + + return std::make_shared<DataTypeString>(); + } + + template <bool is_json> + DataTypePtr tryInferMapOrObject(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info, size_t depth) + { + assertChar('{', buf); + skipWhitespaceIfAny(buf); + + DataTypes key_types; + DataTypes value_types; + bool first = true; + bool have_invalid_nested_type = false; + while (!buf.eof() && *buf.position() != '}') + { + if (!first) + { + if (!checkChar(',', buf)) + return nullptr; + skipWhitespaceIfAny(buf); + } + else + first = false; + + DataTypePtr key_type; + if constexpr (is_json) + { + /// For JSON key type must be String. 
+ json_info->is_object_key = true; + key_type = tryInferString<is_json>(buf, settings, json_info); + json_info->is_object_key = false; + } + else + { + key_type = tryInferDataTypeForSingleFieldImpl<is_json>(buf, settings, nullptr, depth + 1); + } + + if (key_type) + key_types.push_back(key_type); + else + have_invalid_nested_type = true; + + skipWhitespaceIfAny(buf); + if (!checkChar(':', buf)) + return nullptr; + skipWhitespaceIfAny(buf); + + auto value_type = tryInferDataTypeForSingleFieldImpl<is_json>(buf, settings, json_info, depth + 1); + if (value_type) + value_types.push_back(value_type); + else + have_invalid_nested_type = true; + skipWhitespaceIfAny(buf); + } + + /// No '}' at the end. + if (buf.eof()) + return nullptr; + + assertChar('}', buf); + skipWhitespaceIfAny(buf); + + /// Nested data is invalid. + if (have_invalid_nested_type) + return nullptr; + + if (key_types.empty()) + { + if constexpr (is_json) + { + if (settings.json.allow_object_type) + return std::make_shared<DataTypeObject>("json", true); + } + /// Empty Map is Map(Nothing, Nothing) + return std::make_shared<DataTypeMap>(std::make_shared<DataTypeNothing>(), std::make_shared<DataTypeNothing>()); + } + + if constexpr (is_json) + { + /// If it's JSON field and one of value types is JSON Object, return also JSON Object. 
+ for (const auto & value_type : value_types) + { + if (isObject(value_type)) + return std::make_shared<DataTypeObject>("json", true); + } + + transformInferredTypesIfNeededImpl<is_json>(value_types, settings, json_info); + if (!checkIfTypesAreEqual(value_types)) + { + if (settings.json.allow_object_type) + return std::make_shared<DataTypeObject>("json", true); + if (settings.json.read_objects_as_strings) + return std::make_shared<DataTypeString>(); + return nullptr; + } + + return std::make_shared<DataTypeMap>(key_types.back(), value_types.back()); + } + + if (!checkIfTypesAreEqual(key_types)) + transformInferredTypesIfNeededImpl<is_json>(key_types, settings); + if (!checkIfTypesAreEqual(value_types)) + transformInferredTypesIfNeededImpl<is_json>(value_types, settings); + + if (!checkIfTypesAreEqual(key_types) || !checkIfTypesAreEqual(value_types)) + return nullptr; + + auto key_type = removeNullable(key_types.back()); + if (!DataTypeMap::checkKeyType(key_type)) + return nullptr; + + return std::make_shared<DataTypeMap>(key_type, value_types.back()); + } + + template <bool is_json> + DataTypePtr tryInferDataTypeForSingleFieldImpl(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info, size_t depth) + { + if (depth > settings.max_parser_depth) + throw Exception(ErrorCodes::TOO_DEEP_RECURSION, + "Maximum parse depth ({}) exceeded. Consider rising max_parser_depth setting.", settings.max_parser_depth); + + skipWhitespaceIfAny(buf); + + if (buf.eof()) + return nullptr; + + /// Array [field1, field2, ...] 
+ if (*buf.position() == '[') + return tryInferArray<is_json>(buf, settings, json_info, depth); + + /// Tuple (field1, field2, ...), if format is not JSON + if constexpr (!is_json) + { + if (*buf.position() == '(') + return tryInferTuple(buf, settings, json_info, depth); + } + + /// Map/Object for JSON { key1 : value1, key2 : value2, ...} + if (*buf.position() == '{') + return tryInferMapOrObject<is_json>(buf, settings, json_info, depth); + + /// String + char quote = is_json ? '"' : '\''; + if (*buf.position() == quote) + return tryInferString<is_json>(buf, settings, json_info); + + /// Bool + if (checkStringCaseInsensitive("true", buf) || checkStringCaseInsensitive("false", buf)) + return DataTypeFactory::instance().get("Bool"); + + /// Null or NaN + if (checkCharCaseInsensitive('n', buf)) + { + if (checkStringCaseInsensitive("ull", buf)) + return makeNullable(std::make_shared<DataTypeNothing>()); + else if (checkStringCaseInsensitive("an", buf)) + return std::make_shared<DataTypeFloat64>(); + } + + /// Number + return tryInferNumber(buf, settings); + } +} + +void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings) +{ + DataTypes types = {first, second}; + transformInferredTypesIfNeededImpl<false>(types, settings, nullptr); + first = std::move(types[0]); + second = std::move(types[1]); +} + +void transformInferredJSONTypesIfNeeded( + DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, JSONInferenceInfo * json_info) +{ + DataTypes types = {first, second}; + transformInferredTypesIfNeededImpl<true>(types, settings, json_info); + first = std::move(types[0]); + second = std::move(types[1]); +} + +void transformJSONTupleToArrayIfPossible(DataTypePtr & data_type, const FormatSettings & settings, JSONInferenceInfo * json_info) +{ + if (!data_type) + return; + + if (const auto * array_type = typeid_cast<const DataTypeArray *>(data_type.get())) + { + auto nested_type = 
array_type->getNestedType(); + transformJSONTupleToArrayIfPossible(nested_type, settings, json_info); + data_type = std::make_shared<DataTypeArray>(nested_type); + return; + } + + if (const auto * map_type = typeid_cast<const DataTypeMap *>(data_type.get())) + { + auto value_type = map_type->getValueType(); + transformJSONTupleToArrayIfPossible(value_type, settings, json_info); + data_type = std::make_shared<DataTypeMap>(map_type->getKeyType(), value_type); + return; + } + + if (const auto * tuple_type = typeid_cast<const DataTypeTuple *>(data_type.get())) + { + auto nested_types = tuple_type->getElements(); + for (auto & nested_type : nested_types) + transformJSONTupleToArrayIfPossible(nested_type, settings, json_info); + + auto nested_types_copy = nested_types; + transformInferredTypesIfNeededImpl<true>(nested_types_copy, settings, json_info); + if (checkIfTypesAreEqual(nested_types_copy)) + data_type = std::make_shared<DataTypeArray>(nested_types_copy.back()); + else + data_type = std::make_shared<DataTypeTuple>(nested_types); + + return; + } +} + +DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSettings & settings) +{ + ReadBufferFromString buf(field); + + if (settings.try_infer_integers) + { + Int64 tmp_int; + if (tryReadIntText(tmp_int, buf) && buf.eof()) + return std::make_shared<DataTypeInt64>(); + + /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof. + buf.position() = buf.buffer().begin(); + + /// In case of Int64 overflow, try to infer UInt64 + UInt64 tmp_uint; + if (tryReadIntText(tmp_uint, buf) && buf.eof()) + return std::make_shared<DataTypeUInt64>(); + } + + /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof. 
+ buf.position() = buf.buffer().begin(); + + Float64 tmp; + if (tryReadFloatText(tmp, buf) && buf.eof()) + return std::make_shared<DataTypeFloat64>(); + + return nullptr; +} + +DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const FormatSettings & settings) +{ + if (settings.try_infer_dates && tryInferDate(field)) + return std::make_shared<DataTypeDate>(); + + if (settings.try_infer_datetimes && tryInferDateTime(field, settings)) + return std::make_shared<DataTypeDateTime64>(9); + + return nullptr; +} + +DataTypePtr tryInferDataTypeForSingleField(ReadBuffer & buf, const FormatSettings & settings) +{ + return tryInferDataTypeForSingleFieldImpl<false>(buf, settings, nullptr); +} + +DataTypePtr tryInferDataTypeForSingleField(std::string_view field, const FormatSettings & settings) +{ + ReadBufferFromString buf(field); + auto type = tryInferDataTypeForSingleFieldImpl<false>(buf, settings, nullptr); + /// Check if there is no unread data in buffer. + if (!buf.eof()) + return nullptr; + return type; +} + +DataTypePtr tryInferDataTypeForSingleJSONField(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info) +{ + return tryInferDataTypeForSingleFieldImpl<true>(buf, settings, json_info); +} + +DataTypePtr tryInferDataTypeForSingleJSONField(std::string_view field, const FormatSettings & settings, JSONInferenceInfo * json_info) +{ + ReadBufferFromString buf(field); + auto type = tryInferDataTypeForSingleFieldImpl<true>(buf, settings, json_info); + /// Check if there is no unread data in buffer. + if (!buf.eof()) + return nullptr; + return type; +} + +DataTypePtr makeNullableRecursively(DataTypePtr type) +{ + if (!type) + return nullptr; + + WhichDataType which(type); + + if (which.isNullable()) + return type; + + if (which.isArray()) + { + const auto * array_type = assert_cast<const DataTypeArray *>(type.get()); + auto nested_type = makeNullableRecursively(array_type->getNestedType()); + return nested_type ? 
std::make_shared<DataTypeArray>(nested_type) : nullptr; + } + + if (which.isTuple()) + { + const auto * tuple_type = assert_cast<const DataTypeTuple *>(type.get()); + DataTypes nested_types; + for (const auto & element : tuple_type->getElements()) + { + auto nested_type = makeNullableRecursively(element); + if (!nested_type) + return nullptr; + nested_types.push_back(nested_type); + } + + if (tuple_type->haveExplicitNames()) + return std::make_shared<DataTypeTuple>(std::move(nested_types), tuple_type->getElementNames()); + + return std::make_shared<DataTypeTuple>(std::move(nested_types)); + + } + + if (which.isMap()) + { + const auto * map_type = assert_cast<const DataTypeMap *>(type.get()); + auto key_type = makeNullableRecursively(map_type->getKeyType()); + auto value_type = makeNullableRecursively(map_type->getValueType()); + return key_type && value_type ? std::make_shared<DataTypeMap>(removeNullable(key_type), value_type) : nullptr; + } + + if (which.isLowCardinality()) + { + const auto * lc_type = assert_cast<const DataTypeLowCardinality *>(type.get()); + auto nested_type = makeNullableRecursively(lc_type->getDictionaryType()); + return nested_type ? 
std::make_shared<DataTypeLowCardinality>(nested_type) : nullptr; + } + + if (which.isObject()) + { + const auto * object_type = assert_cast<const DataTypeObject *>(type.get()); + if (object_type->hasNullableSubcolumns()) + return type; + return std::make_shared<DataTypeObject>(object_type->getSchemaFormat(), true); + } + + return makeNullable(type); +} + +NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header) +{ + NamesAndTypesList result; + for (auto & [name, type] : header.getNamesAndTypesList()) + result.emplace_back(name, makeNullableRecursively(type)); + return result; +} + +bool checkIfTypeIsComplete(const DataTypePtr & type) +{ + if (!type) + return false; + + WhichDataType which(type); + + if (which.isNothing()) + return false; + + if (which.isNullable()) + return checkIfTypeIsComplete(assert_cast<const DataTypeNullable *>(type.get())->getNestedType()); + + if (which.isArray()) + return checkIfTypeIsComplete(assert_cast<const DataTypeArray *>(type.get())->getNestedType()); + + if (which.isTuple()) + { + const auto * tuple_type = assert_cast<const DataTypeTuple *>(type.get()); + for (const auto & element : tuple_type->getElements()) + { + if (!checkIfTypeIsComplete(element)) + return false; + } + return true; + } + + if (which.isMap()) + { + const auto * map_type = assert_cast<const DataTypeMap *>(type.get()); + if (!checkIfTypeIsComplete(map_type->getKeyType())) + return false; + return checkIfTypeIsComplete(map_type->getValueType()); + } + + return true; +} + +} diff --git a/contrib/clickhouse/src/Formats/SchemaInferenceUtils.h b/contrib/clickhouse/src/Formats/SchemaInferenceUtils.h new file mode 100644 index 0000000000..b511abf6a7 --- /dev/null +++ b/contrib/clickhouse/src/Formats/SchemaInferenceUtils.h @@ -0,0 +1,93 @@ +#pragma once + +#include <DataTypes/IDataType.h> +#include <IO/ReadBuffer.h> + +namespace DB +{ + +/// Struct with some additional information about inferred types for JSON formats. 
+struct JSONInferenceInfo +{ + /// We store numbers that were parsed from strings. + /// It's used in types transformation to change such numbers back to string if needed. + std::unordered_set<const IDataType *> numbers_parsed_from_json_strings; + /// Indicates if currently we are inferring type for Map/Object key. + bool is_object_key = false; +}; + +/// Try to determine datatype of the value in buffer/string. If the type cannot be inferred, return nullptr. +/// In general, it tries to parse a type using the following logic: +/// If we see '[', we try to parse an array of values and recursively determine datatype for each element. +/// If we see '(', we try to parse a tuple of values and recursively determine datatype for each element. +/// If we see '{', we try to parse a Map of keys and values and recursively determine datatype for each key/value. +/// If we see a quote '\'', we treat it as a string and read until next quote. +/// If we see NULL it returns Nullable(Nothing) +/// Otherwise we try to read a number. +DataTypePtr tryInferDataTypeForSingleField(ReadBuffer & buf, const FormatSettings & settings); +DataTypePtr tryInferDataTypeForSingleField(std::string_view field, const FormatSettings & settings); + +/// The same as tryInferDataTypeForSingleField, but for JSON values. +DataTypePtr tryInferDataTypeForSingleJSONField(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info); +DataTypePtr tryInferDataTypeForSingleJSONField(std::string_view field, const FormatSettings & settings, JSONInferenceInfo * json_info); + +/// Try to parse Date or DateTime value from a string. +DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const FormatSettings & settings); + +/// Try to parse a number value from a string. By default, it tries to parse Float64, +/// but if setting try_infer_integers is enabled, it also tries to parse Int64. 
+DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSettings & settings); + +/// It takes two types inferred for the same column and tries to transform them to a common type if possible. +/// It's also used when we try to infer some not ordinary types from another types. +/// Example 1: +/// Dates inferred from strings. In this case we should check if dates were inferred from all strings +/// in the same way and if not, transform inferred dates back to strings. +/// For example, when we have Array(Date) (like `['2020-01-01', '2020-02-02']`) and Array(String) (like `['string', 'abc']` +/// we will convert the first type to Array(String). +/// Example 2: +/// When we have integers and floats for the same value, we should convert all integers to floats. +/// For example, when we have Array(Int64) (like `[123, 456]`) and Array(Float64) (like `[42.42, 4.42]`) +/// we will convert the first type to Array(Float64) +/// Example 3: +/// When we have not complete types like Nullable(Nothing), Array(Nullable(Nothing)) or Tuple(UInt64, Nullable(Nothing)), +/// we try to complete them using the other type. +/// For example, if we have Tuple(UInt64, Nullable(Nothing)) and Tuple(Nullable(Nothing), String) we will convert both +/// types to common type Tuple(Nullable(UInt64), Nullable(String)) +void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings); + +/// The same as transformInferredTypesIfNeeded but uses some specific transformations for JSON. +/// Example 1: +/// When we have numbers inferred from strings and strings, we convert all such numbers back to string. +/// For example, if we have Array(Int64) (like `['123', '456']`) and Array(String) (like `['str', 'abc']`) +/// we will convert the first type to Array(String). 
Note that we collect information about numbers inferred +/// from strings in json_info while inference and use it here, so we will know that Array(Int64) contains +/// integer inferred from a string. +/// Example 2: +/// When we have maps with different value types, we convert all types to JSON object type. +/// For example, if we have Map(String, UInt64) (like `{"a" : 123}`) and Map(String, String) (like `{"b" : 'abc'}`) +/// we will convert both types to Object('JSON'). +void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, JSONInferenceInfo * json_info); + +/// Check if type is Tuple(...), try to transform nested types to find a common type for them and if all nested types +/// are the same after transform, we convert this tuple to an Array with common nested type. +/// For example, if we have Tuple(String, Nullable(Nothing)) we will convert it to Array(String). +/// It's used when all rows were read and we have Tuple in the result type that can be actually an Array. +void transformJSONTupleToArrayIfPossible(DataTypePtr & data_type, const FormatSettings & settings, JSONInferenceInfo * json_info); + +/// Make type Nullable recursively: +/// - Type -> Nullable(type) +/// - Array(Type) -> Array(Nullable(Type)) +/// - Tuple(Type1, ..., TypeN) -> Tuple(Nullable(Type1), ..., Nullable(TypeN)) +/// - Map(KeyType, ValueType) -> Map(KeyType, Nullable(ValueType)) +/// - LowCardinality(Type) -> LowCardinality(Nullable(Type)) +DataTypePtr makeNullableRecursively(DataTypePtr type); + +/// Call makeNullableRecursively for all types +/// in the block and return names and types. 
+NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header); + +/// Check if type contains Nothing, like Array(Tuple(Nullable(Nothing), String)) +bool checkIfTypeIsComplete(const DataTypePtr & type); + +} diff --git a/contrib/clickhouse/src/Formats/StructureToCapnProtoSchema.cpp b/contrib/clickhouse/src/Formats/StructureToCapnProtoSchema.cpp new file mode 100644 index 0000000000..9f4d96b7c8 --- /dev/null +++ b/contrib/clickhouse/src/Formats/StructureToCapnProtoSchema.cpp @@ -0,0 +1,236 @@ +#include <Formats/StructureToCapnProtoSchema.h> +#include <Formats/StructureToFormatSchemaUtils.h> +#include <Columns/ColumnString.h> +#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/DataTypeLowCardinality.h> +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeMap.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/DataTypeEnum.h> +#include <Common/StringUtils/StringUtils.h> +#include <Common/randomSeed.h> +#include <pcg_random.hpp> + + +namespace DB +{ + +using namespace StructureToFormatSchemaUtils; + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +namespace +{ + +const std::unordered_map<TypeIndex, String> capn_proto_simple_type_names = +{ + {TypeIndex::Int8, "Int8"}, + {TypeIndex::UInt8, "UInt8"}, + {TypeIndex::Int16, "Int16"}, + {TypeIndex::UInt16, "UInt16"}, + {TypeIndex::Int32, "Int32"}, + {TypeIndex::UInt32, "UInt32"}, + {TypeIndex::Int64, "Int64"}, + {TypeIndex::UInt64, "UInt64"}, + {TypeIndex::Int128, "Data"}, + {TypeIndex::UInt128, "Data"}, + {TypeIndex::Int256, "Data"}, + {TypeIndex::UInt256, "Data"}, + {TypeIndex::Float32, "Float32"}, + {TypeIndex::Float64, "Float64"}, + {TypeIndex::Decimal32, "Int32"}, + {TypeIndex::Decimal64, "Int64"}, + {TypeIndex::Decimal128, "Data"}, + {TypeIndex::Decimal256, "Data"}, + {TypeIndex::String, "Data"}, + {TypeIndex::FixedString, "Data"}, + {TypeIndex::UUID, "Data"}, + {TypeIndex::Date, "UInt16"}, + {TypeIndex::Date32, "Int32"}, + {TypeIndex::DateTime, "UInt32"}, 
+ {TypeIndex::DateTime64, "Int64"}, + {TypeIndex::IPv4, "UInt32"}, + {TypeIndex::IPv6, "Data"}, +}; + +void writeCapnProtoHeader(WriteBuffer & buf) +{ + pcg64 rng(randomSeed()); + size_t id = rng() | (1ull << 63); /// First bit should be 1 + writeString(fmt::format("@0x{};\n\n", getHexUIntLowercase(id)), buf); +} + +void writeFieldDefinition(WriteBuffer & buf, const String & type_name, const String & column_name, size_t & field_index, size_t indent) +{ + writeIndent(buf, indent); + writeString(fmt::format("{} @{} : {};\n", getSchemaFieldName(column_name), field_index++, type_name), buf); +} + +void startEnum(WriteBuffer & buf, const String & enum_name, size_t indent) +{ + startNested(buf, enum_name, "enum", indent); +} + +void startUnion(WriteBuffer & buf, size_t indent) +{ + startNested(buf, "", "union", indent); +} + +void startStruct(WriteBuffer & buf, const String & struct_name, size_t indent) +{ + startNested(buf, struct_name, "struct", indent); +} + +String prepareAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent); + +void writeField(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t & field_index, size_t indent) +{ + auto field_type_name = prepareAndGetCapnProtoTypeName(buf, data_type, column_name, indent); + writeFieldDefinition(buf, field_type_name, column_name, field_index, indent); +} + +String prepareArrayAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + const auto & nested_type = assert_cast<const DataTypeArray &>(*data_type).getNestedType(); + auto nested_type_name = prepareAndGetCapnProtoTypeName(buf, nested_type, column_name, indent); + return "List(" + nested_type_name + ")"; +} + +String prepareNullableAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + /// Nullable is represented as a struct with union with 2 
fields: + /// + /// struct Nullable + /// { + /// union + /// { + /// value @0 : Value; + /// null @1 : Void; + /// } + /// } + auto struct_name = getSchemaMessageName(column_name); + startStruct(buf, struct_name, indent); + auto nested_type_name = prepareAndGetCapnProtoTypeName(buf, assert_cast<const DataTypeNullable &>(*data_type).getNestedType(), column_name, indent); + startUnion(buf, indent + 1); + size_t field_index = 0; + writeFieldDefinition(buf, nested_type_name, "value", field_index, indent + 2); + writeFieldDefinition(buf, "Void", "null", field_index, indent + 2); + endNested(buf, indent + 1); + endNested(buf, indent); + return struct_name; +} + +String prepareTupleAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + const auto & tuple_type = assert_cast<const DataTypeTuple &>(*data_type); + auto nested_names_and_types = getCollectedTupleElements(tuple_type); + + String struct_name = getSchemaMessageName(column_name); + startStruct(buf, struct_name, indent); + size_t nested_field_index = 0; + for (const auto & [name, type] : nested_names_and_types) + writeField(buf, type, name, nested_field_index, indent + 1); + endNested(buf, indent); + return struct_name; +} + +String prepareMapAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + /// We output/input Map type as follow CapnProto schema + /// + /// struct Map + /// { + /// struct Entry + /// { + /// key @0: Key; + /// value @1: Value; + /// } + /// entries @0 :List(Entry); + /// } + const auto & map_type = assert_cast<const DataTypeMap &>(*data_type); + const auto & key_type = map_type.getKeyType(); + const auto & value_type = map_type.getValueType(); + + String struct_name = getSchemaMessageName(column_name); + startStruct(buf, struct_name, indent); + startStruct(buf, "Entry", indent + 1); + auto key_type_name = prepareAndGetCapnProtoTypeName(buf, key_type, "key", 
indent + 2); + auto value_type_name = prepareAndGetCapnProtoTypeName(buf, value_type, "value", indent + 2); + size_t field_index = 0; + writeFieldDefinition(buf, key_type_name, "key", field_index, indent + 2); + writeFieldDefinition(buf, value_type_name, "value", field_index, indent + 2); + endNested(buf, indent + 1); + field_index = 0; + writeFieldDefinition(buf, "List(Entry)", "entries", field_index, indent + 1); + endNested(buf, indent); + return struct_name; +} + +template <typename EnumType> +String prepareEnumAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + const auto & enum_type = assert_cast<const DataTypeEnum<EnumType> &>(*data_type); + String enum_name = getSchemaMessageName(column_name); + startEnum(buf, enum_name, indent); + const auto & names = enum_type.getAllRegisteredNames(); + for (size_t i = 0; i != names.size(); ++i) + { + writeIndent(buf, indent + 1); + writeString(fmt::format("{} @{};\n", names[i], std::to_string(i)), buf); + } + endNested(buf, indent); + return enum_name; +} + +String prepareAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + TypeIndex type_id = data_type->getTypeId(); + + switch (data_type->getTypeId()) + { + case TypeIndex::Nullable: + return prepareNullableAndGetCapnProtoTypeName(buf, data_type, column_name, indent); + case TypeIndex::LowCardinality: + return prepareAndGetCapnProtoTypeName(buf, assert_cast<const DataTypeLowCardinality &>(*data_type).getDictionaryType(), column_name, indent); + case TypeIndex::Array: + return prepareArrayAndGetCapnProtoTypeName(buf, data_type, column_name, indent); + case TypeIndex::Tuple: + return prepareTupleAndGetCapnProtoTypeName(buf, data_type, column_name, indent); + case TypeIndex::Map: + return prepareMapAndGetCapnProtoTypeName(buf, data_type, column_name, indent); + case TypeIndex::Enum8: + return prepareEnumAndGetCapnProtoTypeName<Int8>(buf, 
data_type, column_name, indent); + case TypeIndex::Enum16: + return prepareEnumAndGetCapnProtoTypeName<Int16>(buf, data_type, column_name, indent); + default: + { + if (isBool(data_type)) + return "Bool"; + + auto it = capn_proto_simple_type_names.find(type_id); + if (it == capn_proto_simple_type_names.end()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "CapnProto type name is not found for type {}", data_type->getName()); + return it->second; + } + } +} + +} + +void StructureToCapnProtoSchema::writeSchema(WriteBuffer & buf, const String & message_name, const NamesAndTypesList & names_and_types_) +{ + auto names_and_types = collectNested(names_and_types_); + writeCapnProtoHeader(buf); + startStruct(buf, getSchemaMessageName(message_name), 0); + + size_t field_index = 0; + for (const auto & [column_name, data_type] : names_and_types) + writeField(buf, data_type, column_name, field_index, 1); + + endNested(buf, 0); +} + +} diff --git a/contrib/clickhouse/src/Formats/StructureToCapnProtoSchema.h b/contrib/clickhouse/src/Formats/StructureToCapnProtoSchema.h new file mode 100644 index 0000000000..b2a0a8a8cf --- /dev/null +++ b/contrib/clickhouse/src/Formats/StructureToCapnProtoSchema.h @@ -0,0 +1,16 @@ +#pragma once + +#include <IO/WriteBuffer.h> +#include <Core/NamesAndTypes.h> + +namespace DB +{ + +struct StructureToCapnProtoSchema +{ + static constexpr auto name = "structureToCapnProtoSchema"; + + static void writeSchema(WriteBuffer & buf, const String & message_name, const NamesAndTypesList & names_and_types_); +}; + +} diff --git a/contrib/clickhouse/src/Formats/StructureToFormatSchemaUtils.cpp b/contrib/clickhouse/src/Formats/StructureToFormatSchemaUtils.cpp new file mode 100644 index 0000000000..a9374647eb --- /dev/null +++ b/contrib/clickhouse/src/Formats/StructureToFormatSchemaUtils.cpp @@ -0,0 +1,117 @@ +#include <Formats/StructureToFormatSchemaUtils.h> +#include <IO/WriteHelpers.h> + +namespace DB +{ + +namespace StructureToFormatSchemaUtils +{ + +void 
writeIndent(WriteBuffer & buf, size_t indent) +{ + writeChar(' ', indent * 4, buf); +} + +void startNested(WriteBuffer & buf, const String & nested_name, const String & nested_type, size_t indent) +{ + writeIndent(buf, indent); + writeString(nested_type, buf); + if (!nested_name.empty()) + { + writeChar(' ', buf); + writeString(nested_name, buf); + } + writeChar('\n', buf); + writeIndent(buf, indent); + writeCString("{\n", buf); +} + +void endNested(WriteBuffer & buf, size_t indent) +{ + writeIndent(buf, indent); + writeCString("}\n", buf); +} + +String getSchemaFieldName(const String & column_name) +{ + String result = column_name; + /// Replace all first uppercase letters to lower-case, + /// because fields in CapnProto schema must begin with a lower-case letter. + /// Don't replace all letters to lower-case to remain camelCase field names. + for (auto & symbol : result) + { + if (islower(symbol)) + break; + symbol = tolower(symbol); + } + return result; +} + +String getSchemaMessageName(const String & column_name) +{ + String result = column_name; + if (!column_name.empty() && isalpha(column_name[0])) + result[0] = toupper(column_name[0]); + return result; +} + +namespace +{ + std::pair<String, String> splitName(const String & name) + { + const auto * begin = name.data(); + const auto * end = name.data() + name.size(); + const auto * it = find_first_symbols<'_', '.'>(begin, end); + String first = String(begin, it); + String second = it == end ? "" : String(it + 1, end); + return {std::move(first), std::move(second)}; + } +} + +NamesAndTypesList collectNested(const NamesAndTypesList & names_and_types) +{ + /// Find all columns with dots '.' or underscores '_' and move them into a tuple. 
+ /// For example if we have columns 'a.b UInt32, a.c UInt32, x_y String' we will + /// change it to 'a Tuple(b UInt32, c UInt32), x Tuple(y String)' + NamesAndTypesList result; + std::unordered_map<String, NamesAndTypesList> nested; + for (const auto & [name, type] : names_and_types) + { + auto [field_name, nested_name] = splitName(name); + if (nested_name.empty()) + result.emplace_back(name, type); + else + nested[field_name].emplace_back(nested_name, type); + } + + for (const auto & [field_name, elements]: nested) + result.emplace_back(field_name, std::make_shared<DataTypeTuple>(elements.getTypes(), elements.getNames())); + + return result; +} + +NamesAndTypesList getCollectedTupleElements(const DataTypeTuple & tuple_type) +{ + const auto & nested_types = tuple_type.getElements(); + Names nested_names; + if (tuple_type.haveExplicitNames()) + { + nested_names = tuple_type.getElementNames(); + } + else + { + nested_names.reserve(nested_types.size()); + for (size_t i = 0; i != nested_types.size(); ++i) + nested_names.push_back("e" + std::to_string(i + 1)); + } + + NamesAndTypesList result; + for (size_t i = 0; i != nested_names.size(); ++i) + result.emplace_back(nested_names[i], nested_types[i]); + + return collectNested(result); +} + +} + +} diff --git a/contrib/clickhouse/src/Formats/StructureToFormatSchemaUtils.h b/contrib/clickhouse/src/Formats/StructureToFormatSchemaUtils.h new file mode 100644 index 0000000000..c6b86501ac --- /dev/null +++ b/contrib/clickhouse/src/Formats/StructureToFormatSchemaUtils.h @@ -0,0 +1,27 @@ +#pragma once + +#include <Core/NamesAndTypes.h> +#include <DataTypes/NestedUtils.h> +#include <DataTypes/DataTypeTuple.h> + +namespace DB +{ + +namespace StructureToFormatSchemaUtils +{ + void writeIndent(WriteBuffer & buf, size_t indent); + + void startNested(WriteBuffer & buf, const String & nested_name, const String & nested_type, size_t indent); + + void endNested(WriteBuffer & buf, size_t indent); + + String getSchemaFieldName(const 
String & column_name); + + String getSchemaMessageName(const String & column_name); + + NamesAndTypesList collectNested(const NamesAndTypesList & names_and_types); + + NamesAndTypesList getCollectedTupleElements(const DataTypeTuple & tuple_type); +} + +} diff --git a/contrib/clickhouse/src/Formats/StructureToProtobufSchema.cpp b/contrib/clickhouse/src/Formats/StructureToProtobufSchema.cpp new file mode 100644 index 0000000000..4a704e8d42 --- /dev/null +++ b/contrib/clickhouse/src/Formats/StructureToProtobufSchema.cpp @@ -0,0 +1,214 @@ +#include <Formats/StructureToProtobufSchema.h> +#include <Formats/StructureToFormatSchemaUtils.h> +#include <Columns/ColumnString.h> +#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/DataTypeLowCardinality.h> +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeMap.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/DataTypeEnum.h> +#include <Common/StringUtils/StringUtils.h> + +namespace DB +{ + +using namespace StructureToFormatSchemaUtils; + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +namespace +{ + +const std::unordered_map<TypeIndex, String> protobuf_simple_type_names = +{ + {TypeIndex::Int8, "int32"}, + {TypeIndex::UInt8, "uint32"}, + {TypeIndex::Int16, "int32"}, + {TypeIndex::UInt16, "uint32"}, + {TypeIndex::Int32, "int32"}, + {TypeIndex::UInt32, "uint32"}, + {TypeIndex::Int64, "int64"}, + {TypeIndex::UInt64, "uint64"}, + {TypeIndex::Int128, "bytes"}, + {TypeIndex::UInt128, "bytes"}, + {TypeIndex::Int256, "bytes"}, + {TypeIndex::UInt256, "bytes"}, + {TypeIndex::Float32, "float"}, + {TypeIndex::Float64, "double"}, + {TypeIndex::Decimal32, "bytes"}, + {TypeIndex::Decimal64, "bytes"}, + {TypeIndex::Decimal128, "bytes"}, + {TypeIndex::Decimal256, "bytes"}, + {TypeIndex::String, "bytes"}, + {TypeIndex::FixedString, "bytes"}, + {TypeIndex::UUID, "bytes"}, + {TypeIndex::Date, "uint32"}, + {TypeIndex::Date32, "int32"}, + {TypeIndex::DateTime, "uint32"}, + {TypeIndex::DateTime64, 
"uint64"}, + {TypeIndex::IPv4, "uint32"}, + {TypeIndex::IPv6, "bytes"}, +}; + +void writeProtobufHeader(WriteBuffer & buf) +{ + writeCString("syntax = \"proto3\";\n\n", buf); +} + +void startEnum(WriteBuffer & buf, const String & enum_name, size_t indent) +{ + startNested(buf, enum_name, "enum", indent); +} + +void startMessage(WriteBuffer & buf, const String & message_name, size_t indent) +{ + startNested(buf, message_name, "message", indent); +} + +void writeFieldDefinition(WriteBuffer & buf, const String & type_name, const String & column_name, size_t & field_index, size_t indent) +{ + writeIndent(buf, indent); + writeString(fmt::format("{} {} = {};\n", type_name, getSchemaFieldName(column_name), field_index++), buf); +} + +String prepareAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent); + +void writeProtobufField(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t & field_index, size_t indent) +{ + auto field_type_name = prepareAndGetProtobufTypeName(buf, data_type, column_name, indent); + writeFieldDefinition(buf, field_type_name, column_name, field_index, indent); +} + +String prepareArrayAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + const auto & nested_type = assert_cast<const DataTypeArray &>(*data_type).getNestedType(); + /// Simple case when we can just use 'repeated <nested_type>'. + if (!isArray(nested_type) && !isMap(nested_type)) + { + auto nested_type_name = prepareAndGetProtobufTypeName(buf, nested_type, column_name, indent); + return "repeated " + nested_type_name; + } + + /// Protobuf doesn't support multidimensional repeated fields and repeated maps. + /// When we have Array(Array(...)) or Array(Map(...)) we should place nested type into a nested Message with one field. 
+ String message_name = getSchemaMessageName(column_name); + startMessage(buf, message_name, indent); + size_t nested_field_index = 1; + writeProtobufField(buf, nested_type, column_name, nested_field_index, indent + 1); + endNested(buf, indent); + return "repeated " + message_name; +} + +String prepareTupleAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + const auto & tuple_type = assert_cast<const DataTypeTuple &>(*data_type); + auto nested_names_and_types = getCollectedTupleElements(tuple_type); + + String message_name = getSchemaMessageName(column_name); + startMessage(buf, message_name, indent); + size_t nested_field_index = 1; + for (const auto & [name, type] : nested_names_and_types) + writeProtobufField(buf, type, name, nested_field_index, indent + 1); + endNested(buf, indent); + return message_name; +} + +String prepareMapAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + const auto & map_type = assert_cast<const DataTypeMap &>(*data_type); + const auto & key_type = map_type.getKeyType(); + const auto & value_type = map_type.getValueType(); + auto it = protobuf_simple_type_names.find(key_type->getTypeId()); + if (it == protobuf_simple_type_names.end()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Type {} is not supported for conversion into Map key in Protobuf schema", data_type->getName()); + auto key_type_name = it->second; + /// Protobuf map type doesn't support "bytes" type as a key. Change it to "string" + if (key_type_name == "bytes") + key_type_name = "string"; + + /// Special cases when value type is Array or Map, because Protobuf + /// doesn't support syntax "map<Key, repeated Value>" and "map<Key, map<..., ...>>" + /// In this case we should place it into a nested Message with one field. 
+ String value_type_name; + if (isArray(value_type) || isMap(value_type)) + { + value_type_name = getSchemaMessageName(column_name) + "Value"; + startMessage(buf, value_type_name, indent); + size_t nested_field_index = 1; + writeProtobufField(buf, value_type, column_name + "Value", nested_field_index, indent + 1); + endNested(buf, indent); + } + else + { + value_type_name = prepareAndGetProtobufTypeName(buf, value_type, column_name + "Value", indent); + } + + return fmt::format("map<{}, {}>", key_type_name, value_type_name); +} + +template <typename EnumType> +String prepareEnumAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + const auto & enum_type = assert_cast<const DataTypeEnum<EnumType> &>(*data_type); + String enum_name = getSchemaMessageName(column_name); + startEnum(buf, enum_name, indent); + const auto & names = enum_type.getAllRegisteredNames(); + for (size_t i = 0; i != names.size(); ++i) + { + writeIndent(buf, indent + 1); + writeString(fmt::format("{} = {};\n", names[i], std::to_string(i)), buf); + } + endNested(buf, indent); + return enum_name; +} + +String prepareAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) +{ + TypeIndex type_id = data_type->getTypeId(); + + switch (data_type->getTypeId()) + { + case TypeIndex::Nullable: + return prepareAndGetProtobufTypeName(buf, assert_cast<const DataTypeNullable &>(*data_type).getNestedType(), column_name, indent); + case TypeIndex::LowCardinality: + return prepareAndGetProtobufTypeName(buf, assert_cast<const DataTypeLowCardinality &>(*data_type).getDictionaryType(), column_name, indent); + case TypeIndex::Array: + return prepareArrayAndGetProtobufTypeName(buf, data_type, column_name, indent); + case TypeIndex::Tuple: + return prepareTupleAndGetProtobufTypeName(buf, data_type, column_name, indent); + case TypeIndex::Map: + return prepareMapAndGetProtobufTypeName(buf, 
data_type, column_name, indent); + case TypeIndex::Enum8: + return prepareEnumAndGetProtobufTypeName<Int8>(buf, data_type, column_name, indent); + case TypeIndex::Enum16: + return prepareEnumAndGetProtobufTypeName<Int16>(buf, data_type, column_name, indent); + default: + { + if (isBool(data_type)) + return "bool"; + + auto it = protobuf_simple_type_names.find(type_id); + if (it == protobuf_simple_type_names.end()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Type {} is not supported for conversion into Protobuf schema", data_type->getName()); + return it->second; + } + } +} + +} + +void StructureToProtobufSchema::writeSchema(WriteBuffer & buf, const String & message_name, const NamesAndTypesList & names_and_types_) +{ + auto names_and_types = collectNested(names_and_types_); + writeProtobufHeader(buf); + startMessage(buf, getSchemaMessageName(message_name), 0); + size_t field_index = 1; + for (const auto & [column_name, data_type] : names_and_types) + writeProtobufField(buf, data_type, column_name, field_index, 1); + endNested(buf, 0); +} + +} diff --git a/contrib/clickhouse/src/Formats/StructureToProtobufSchema.h b/contrib/clickhouse/src/Formats/StructureToProtobufSchema.h new file mode 100644 index 0000000000..f4dfb0ae0c --- /dev/null +++ b/contrib/clickhouse/src/Formats/StructureToProtobufSchema.h @@ -0,0 +1,16 @@ +#pragma once + +#include <IO/WriteBuffer.h> +#include <Core/NamesAndTypes.h> + +namespace DB +{ + +struct StructureToProtobufSchema +{ + static constexpr auto name = "structureToProtobufSchema"; + + static void writeSchema(WriteBuffer & buf, const String & message_name, const NamesAndTypesList & names_and_types_); +}; + +} diff --git a/contrib/clickhouse/src/Formats/TemporaryFileStreamLegacy.cpp b/contrib/clickhouse/src/Formats/TemporaryFileStreamLegacy.cpp new file mode 100644 index 0000000000..e6651f0e83 --- /dev/null +++ b/contrib/clickhouse/src/Formats/TemporaryFileStreamLegacy.cpp @@ -0,0 +1,45 @@ +#include <Formats/TemporaryFileStreamLegacy.h> 
+#include <Formats/NativeReader.h> +#include <Formats/NativeWriter.h> +#include <Processors/Executors/PullingPipelineExecutor.h> +#include <Processors/ISource.h> +#include <Compression/CompressedWriteBuffer.h> +#include <IO/WriteBufferFromFile.h> +#include <Core/ProtocolDefines.h> + + +namespace DB +{ + +/// To read the data that was flushed into the temporary data file. +TemporaryFileStreamLegacy::TemporaryFileStreamLegacy(const std::string & path) + : file_in(path) + , compressed_in(file_in) + , block_in(std::make_unique<NativeReader>(compressed_in, DBMS_TCP_PROTOCOL_VERSION)) +{} + +TemporaryFileStreamLegacy::TemporaryFileStreamLegacy(const std::string & path, const Block & header_) + : file_in(path) + , compressed_in(file_in) + , block_in(std::make_unique<NativeReader>(compressed_in, header_, 0)) +{} + +/// Flush data from input stream into file for future reading +TemporaryFileStreamLegacy::Stat TemporaryFileStreamLegacy::write(const std::string & path, const Block & header, QueryPipelineBuilder builder, const std::string & codec) +{ + WriteBufferFromFile file_buf(path); + CompressedWriteBuffer compressed_buf(file_buf, CompressionCodecFactory::instance().get(codec, {})); + NativeWriter output(compressed_buf, 0, header); + + auto pipeline = QueryPipelineBuilder::getPipeline(std::move(builder)); + PullingPipelineExecutor executor(pipeline); + + Block block; + while (executor.pull(block)) + output.write(block); + + compressed_buf.finalize(); + return Stat{compressed_buf.getCompressedBytes(), compressed_buf.getUncompressedBytes()}; +} + +} diff --git a/contrib/clickhouse/src/Formats/TemporaryFileStreamLegacy.h b/contrib/clickhouse/src/Formats/TemporaryFileStreamLegacy.h new file mode 100644 index 0000000000..90e318c970 --- /dev/null +++ b/contrib/clickhouse/src/Formats/TemporaryFileStreamLegacy.h @@ -0,0 +1,34 @@ +#pragma once + +#include <Processors/ISource.h> +#include <QueryPipeline/QueryPipelineBuilder.h> +#include <Compression/CompressedReadBuffer.h> 
+#include <IO/ReadBufferFromFile.h> +#include <Formats/NativeReader.h> + +namespace DB +{ + +/// Used only in MergeJoin +/// TODO: use `TemporaryDataOnDisk` instead +/// To read the data that was flushed into the temporary data file. +struct TemporaryFileStreamLegacy +{ + struct Stat + { + size_t compressed_bytes = 0; + size_t uncompressed_bytes = 0; + }; + + ReadBufferFromFile file_in; + CompressedReadBuffer compressed_in; + std::unique_ptr<NativeReader> block_in; + + explicit TemporaryFileStreamLegacy(const std::string & path); + TemporaryFileStreamLegacy(const std::string & path, const Block & header_); + + /// Flush data from input stream into file for future reading + static Stat write(const std::string & path, const Block & header, QueryPipelineBuilder builder, const std::string & codec); +}; + +} diff --git a/contrib/clickhouse/src/Formats/formatBlock.cpp b/contrib/clickhouse/src/Formats/formatBlock.cpp new file mode 100644 index 0000000000..d2b401207a --- /dev/null +++ b/contrib/clickhouse/src/Formats/formatBlock.cpp @@ -0,0 +1,21 @@ +#include <Core/Block.h> +#include <Formats/formatBlock.h> +#include <Processors/Formats/IOutputFormat.h> +#include <Processors/Sources/SourceFromSingleChunk.h> +#include <QueryPipeline/QueryPipeline.h> +#include <Processors/Executors/CompletedPipelineExecutor.h> + +namespace DB +{ + +void formatBlock(OutputFormatPtr out, const Block & block) +{ + auto source = std::make_shared<SourceFromSingleChunk>(block); + QueryPipeline pipeline(source); + pipeline.complete(out); + CompletedPipelineExecutor executor(pipeline); + executor.execute(); + out->flush(); +} + +} diff --git a/contrib/clickhouse/src/Formats/formatBlock.h b/contrib/clickhouse/src/Formats/formatBlock.h new file mode 100644 index 0000000000..fd206a23a4 --- /dev/null +++ b/contrib/clickhouse/src/Formats/formatBlock.h @@ -0,0 +1,14 @@ +#pragma once +#include <memory> + +namespace DB +{ + +class Block; + +class IOutputFormat; +using OutputFormatPtr = 
std::shared_ptr<IOutputFormat>; + +void formatBlock(OutputFormatPtr out, const Block & block); + +} diff --git a/contrib/clickhouse/src/Formats/insertNullAsDefaultIfNeeded.cpp b/contrib/clickhouse/src/Formats/insertNullAsDefaultIfNeeded.cpp new file mode 100644 index 0000000000..767892718c --- /dev/null +++ b/contrib/clickhouse/src/Formats/insertNullAsDefaultIfNeeded.cpp @@ -0,0 +1,37 @@ +#include <Formats/insertNullAsDefaultIfNeeded.h> +#include <Columns/ColumnNullable.h> +#include <Columns/ColumnLowCardinality.h> +#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/DataTypeLowCardinality.h> + +namespace DB +{ + +void insertNullAsDefaultIfNeeded(ColumnWithTypeAndName & input_column, const ColumnWithTypeAndName & header_column, size_t column_i, BlockMissingValues * block_missing_values) +{ + if (!isNullableOrLowCardinalityNullable(input_column.type) || isNullableOrLowCardinalityNullable(header_column.type)) + return; + + if (block_missing_values) + { + for (size_t i = 0; i < input_column.column->size(); ++i) + { + if (input_column.column->isNullAt(i)) + block_missing_values->setBit(column_i, i); + } + } + + if (input_column.type->isNullable()) + { + input_column.column = assert_cast<const ColumnNullable *>(input_column.column.get())->getNestedColumnWithDefaultOnNull(); + input_column.type = removeNullable(input_column.type); + } + else + { + input_column.column = assert_cast<const ColumnLowCardinality *>(input_column.column.get())->cloneWithDefaultOnNull(); + const auto * lc_type = assert_cast<const DataTypeLowCardinality *>(input_column.type.get()); + input_column.type = std::make_shared<DataTypeLowCardinality>(removeNullable(lc_type->getDictionaryType())); + } +} + +} diff --git a/contrib/clickhouse/src/Formats/insertNullAsDefaultIfNeeded.h b/contrib/clickhouse/src/Formats/insertNullAsDefaultIfNeeded.h new file mode 100644 index 0000000000..3e4dcd1e74 --- /dev/null +++ b/contrib/clickhouse/src/Formats/insertNullAsDefaultIfNeeded.h @@ -0,0 +1,10 @@ 
+#pragma once + +#include <Core/Block.h> + +namespace DB +{ + +void insertNullAsDefaultIfNeeded(ColumnWithTypeAndName & input_column, const ColumnWithTypeAndName & header_column, size_t column_i, BlockMissingValues * block_missing_values); + +} diff --git a/contrib/clickhouse/src/Formats/registerFormats.cpp b/contrib/clickhouse/src/Formats/registerFormats.cpp new file mode 100644 index 0000000000..5e91f433fe --- /dev/null +++ b/contrib/clickhouse/src/Formats/registerFormats.cpp @@ -0,0 +1,289 @@ +#include "clickhouse_config.h" + +#include <Formats/FormatFactory.h> + + +namespace DB +{ + +/// File Segmentation Engines for parallel reading + +void registerFileSegmentationEngineTabSeparated(FormatFactory & factory); +void registerFileSegmentationEngineCSV(FormatFactory & factory); +void registerFileSegmentationEngineJSONEachRow(FormatFactory & factory); +void registerFileSegmentationEngineRegexp(FormatFactory & factory); +void registerFileSegmentationEngineJSONAsString(FormatFactory & factory); +void registerFileSegmentationEngineJSONAsObject(FormatFactory & factory); +void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory); +#if USE_HIVE +void registerFileSegmentationEngineHiveText(FormatFactory & factory); +#endif +void registerFileSegmentationEngineLineAsString(FormatFactory & factory); +void registerFileSegmentationEngineBSONEachRow(FormatFactory & factory); + +/// Formats for both input/output. 
+ +void registerInputFormatNative(FormatFactory & factory); +void registerOutputFormatNative(FormatFactory & factory); + +void registerInputFormatRowBinary(FormatFactory & factory); +void registerOutputFormatRowBinary(FormatFactory & factory); +void registerInputFormatTabSeparated(FormatFactory & factory); +void registerOutputFormatTabSeparated(FormatFactory & factory); +void registerInputFormatValues(FormatFactory & factory); +void registerOutputFormatValues(FormatFactory & factory); +void registerInputFormatCSV(FormatFactory & factory); +void registerOutputFormatCSV(FormatFactory & factory); +void registerInputFormatTSKV(FormatFactory & factory); +void registerOutputFormatTSKV(FormatFactory & factory); +void registerOutputFormatJSON(FormatFactory & factory); +void registerInputFormatJSON(FormatFactory & factory); +void registerOutputFormatJSONCompact(FormatFactory & factory); +void registerInputFormatJSONCompact(FormatFactory & factory); +void registerInputFormatJSONEachRow(FormatFactory & factory); +void registerOutputFormatJSONEachRow(FormatFactory & factory); +void registerInputFormatJSONObjectEachRow(FormatFactory & factory); +void registerOutputFormatJSONObjectEachRow(FormatFactory & factory); +void registerInputFormatJSONCompactEachRow(FormatFactory & factory); +void registerOutputFormatJSONCompactEachRow(FormatFactory & factory); +void registerInputFormatJSONColumns(FormatFactory & factory); +void registerOutputFormatJSONColumns(FormatFactory & factory); +void registerInputFormatJSONCompactColumns(FormatFactory & factory); +void registerOutputFormatJSONCompactColumns(FormatFactory & factory); +void registerInputFormatBSONEachRow(FormatFactory & factory); +void registerOutputFormatBSONEachRow(FormatFactory & factory); +void registerInputFormatJSONColumnsWithMetadata(FormatFactory & factory); +void registerOutputFormatJSONColumnsWithMetadata(FormatFactory & factory); +void registerInputFormatProtobuf(FormatFactory & factory); +void 
registerOutputFormatProtobuf(FormatFactory & factory); +void registerInputFormatProtobufList(FormatFactory & factory); +void registerOutputFormatProtobufList(FormatFactory & factory); +void registerInputFormatTemplate(FormatFactory & factory); +void registerOutputFormatTemplate(FormatFactory & factory); +void registerInputFormatMsgPack(FormatFactory & factory); +void registerOutputFormatMsgPack(FormatFactory & factory); +void registerInputFormatORC(FormatFactory & factory); +void registerOutputFormatORC(FormatFactory & factory); +void registerInputFormatParquet(FormatFactory & factory); +void registerOutputFormatParquet(FormatFactory & factory); +void registerInputFormatArrow(FormatFactory & factory); +void registerOutputFormatArrow(FormatFactory & factory); +void registerInputFormatAvro(FormatFactory & factory); +void registerOutputFormatAvro(FormatFactory & factory); +void registerInputFormatRawBLOB(FormatFactory & factory); +void registerOutputFormatRawBLOB(FormatFactory & factory); +void registerInputFormatCustomSeparated(FormatFactory & factory); +void registerOutputFormatCustomSeparated(FormatFactory & factory); +void registerInputFormatCapnProto(FormatFactory & factory); +void registerOutputFormatCapnProto(FormatFactory & factory); + +/// Output only (presentational) formats. 
+ +void registerOutputFormatPretty(FormatFactory & factory); +void registerOutputFormatPrettyCompact(FormatFactory & factory); +void registerOutputFormatPrettySpace(FormatFactory & factory); +void registerOutputFormatVertical(FormatFactory & factory); +void registerOutputFormatJSONEachRowWithProgress(FormatFactory & factory); +void registerOutputFormatXML(FormatFactory & factory); +void registerOutputFormatODBCDriver2(FormatFactory & factory); +void registerOutputFormatNull(FormatFactory & factory); +void registerOutputFormatMySQLWire(FormatFactory & factory); +void registerOutputFormatMarkdown(FormatFactory & factory); +void registerOutputFormatPostgreSQLWire(FormatFactory & factory); +void registerOutputFormatPrometheus(FormatFactory & factory); +void registerOutputFormatSQLInsert(FormatFactory & factory); + +/// Input only formats. + +void registerInputFormatRegexp(FormatFactory & factory); +void registerInputFormatJSONAsString(FormatFactory & factory); +void registerInputFormatJSONAsObject(FormatFactory & factory); +void registerInputFormatLineAsString(FormatFactory & factory); +void registerInputFormatMySQLDump(FormatFactory & factory); +void registerInputFormatParquetMetadata(FormatFactory & factory); +void registerInputFormatOne(FormatFactory & factory); + +#if USE_HIVE +void registerInputFormatHiveText(FormatFactory & factory); +#endif + +/// Non trivial prefix and suffix checkers for disabling parallel parsing. 
+void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory); +void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factory); +void registerNonTrivialPrefixAndSuffixCheckerJSONAsObject(FormatFactory & factory); + +void registerArrowSchemaReader(FormatFactory & factory); +void registerParquetSchemaReader(FormatFactory & factory); +void registerORCSchemaReader(FormatFactory & factory); +void registerTSVSchemaReader(FormatFactory & factory); +void registerCSVSchemaReader(FormatFactory & factory); +void registerJSONCompactEachRowSchemaReader(FormatFactory & factory); +void registerJSONSchemaReader(FormatFactory & factory); +void registerJSONEachRowSchemaReader(FormatFactory & factory); +void registerJSONObjectEachRowSchemaReader(FormatFactory & factory); +void registerJSONAsStringSchemaReader(FormatFactory & factory); +void registerJSONAsObjectSchemaReader(FormatFactory & factory); +void registerJSONColumnsSchemaReader(FormatFactory & factory); +void registerJSONCompactColumnsSchemaReader(FormatFactory & factory); +void registerJSONColumnsWithMetadataSchemaReader(FormatFactory & factory); +void registerNativeSchemaReader(FormatFactory & factory); +void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory); +void registerAvroSchemaReader(FormatFactory & factory); +void registerProtobufSchemaReader(FormatFactory & factory); +void registerProtobufListSchemaReader(FormatFactory & factory); +void registerLineAsStringSchemaReader(FormatFactory & factory); +void registerRawBLOBSchemaReader(FormatFactory & factory); +void registerMsgPackSchemaReader(FormatFactory & factory); +void registerCapnProtoSchemaReader(FormatFactory & factory); +void registerCustomSeparatedSchemaReader(FormatFactory & factory); +void registerRegexpSchemaReader(FormatFactory & factory); +void registerTSKVSchemaReader(FormatFactory & factory); +void registerValuesSchemaReader(FormatFactory & factory); +void 
registerTemplateSchemaReader(FormatFactory & factory); +void registerMySQLSchemaReader(FormatFactory & factory); +void registerBSONEachRowSchemaReader(FormatFactory & factory); +void registerParquetMetadataSchemaReader(FormatFactory & factory); +void registerOneSchemaReader(FormatFactory & factory); + +void registerFileExtensions(FormatFactory & factory); + +void registerFormats() +{ + auto & factory = FormatFactory::instance(); + + registerFileSegmentationEngineTabSeparated(factory); + registerFileSegmentationEngineCSV(factory); + registerFileSegmentationEngineRegexp(factory); + registerFileSegmentationEngineJSONEachRow(factory); + registerFileSegmentationEngineJSONAsString(factory); + registerFileSegmentationEngineJSONAsObject(factory); + registerFileSegmentationEngineJSONCompactEachRow(factory); +#if USE_HIVE + registerFileSegmentationEngineHiveText(factory); +#endif + registerFileSegmentationEngineLineAsString(factory); + registerFileSegmentationEngineBSONEachRow(factory); + + + registerInputFormatNative(factory); + registerOutputFormatNative(factory); + + registerInputFormatRowBinary(factory); + registerOutputFormatRowBinary(factory); + registerInputFormatTabSeparated(factory); + registerOutputFormatTabSeparated(factory); + registerInputFormatValues(factory); + registerOutputFormatValues(factory); + registerInputFormatCSV(factory); + registerOutputFormatCSV(factory); + registerInputFormatTSKV(factory); + registerOutputFormatTSKV(factory); + registerOutputFormatJSON(factory); + registerInputFormatJSON(factory); + registerOutputFormatJSONCompact(factory); + registerInputFormatJSONCompact(factory); + registerInputFormatJSONEachRow(factory); + registerOutputFormatJSONEachRow(factory); + registerInputFormatJSONObjectEachRow(factory); + registerOutputFormatJSONObjectEachRow(factory); + registerInputFormatJSONCompactEachRow(factory); + registerOutputFormatJSONCompactEachRow(factory); + registerInputFormatJSONColumns(factory); + 
registerOutputFormatJSONColumns(factory); + registerInputFormatJSONCompactColumns(factory); + registerOutputFormatJSONCompactColumns(factory); + registerInputFormatBSONEachRow(factory); + registerOutputFormatBSONEachRow(factory); + registerInputFormatJSONColumnsWithMetadata(factory); + registerOutputFormatJSONColumnsWithMetadata(factory); + registerInputFormatProtobuf(factory); + registerOutputFormatProtobufList(factory); + registerInputFormatProtobufList(factory); + registerOutputFormatProtobuf(factory); + registerInputFormatTemplate(factory); + registerOutputFormatTemplate(factory); + registerInputFormatMsgPack(factory); + registerOutputFormatMsgPack(factory); + registerInputFormatRawBLOB(factory); + registerOutputFormatRawBLOB(factory); + registerInputFormatCustomSeparated(factory); + registerOutputFormatCustomSeparated(factory); + + registerInputFormatORC(factory); + registerOutputFormatORC(factory); + registerInputFormatParquet(factory); + registerOutputFormatParquet(factory); + registerInputFormatAvro(factory); + registerOutputFormatAvro(factory); + registerInputFormatArrow(factory); + registerOutputFormatArrow(factory); + + registerOutputFormatPretty(factory); + registerOutputFormatPrettyCompact(factory); + registerOutputFormatPrettySpace(factory); + registerOutputFormatVertical(factory); + registerOutputFormatJSONEachRowWithProgress(factory); + registerOutputFormatXML(factory); + registerOutputFormatODBCDriver2(factory); + registerOutputFormatNull(factory); + registerOutputFormatMySQLWire(factory); + registerOutputFormatMarkdown(factory); + registerOutputFormatPostgreSQLWire(factory); + registerOutputFormatCapnProto(factory); + registerOutputFormatPrometheus(factory); + registerOutputFormatSQLInsert(factory); + + registerInputFormatRegexp(factory); + registerInputFormatJSONAsString(factory); + registerInputFormatJSONAsObject(factory); + registerInputFormatLineAsString(factory); +#if USE_HIVE + registerInputFormatHiveText(factory); +#endif + + 
registerInputFormatCapnProto(factory); + registerInputFormatMySQLDump(factory); + + registerInputFormatParquetMetadata(factory); + registerInputFormatOne(factory); + + registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(factory); + registerNonTrivialPrefixAndSuffixCheckerJSONAsString(factory); + registerNonTrivialPrefixAndSuffixCheckerJSONAsObject(factory); + + registerArrowSchemaReader(factory); + registerParquetSchemaReader(factory); + registerORCSchemaReader(factory); + registerTSVSchemaReader(factory); + registerCSVSchemaReader(factory); + registerJSONSchemaReader(factory); + registerJSONCompactEachRowSchemaReader(factory); + registerJSONEachRowSchemaReader(factory); + registerJSONObjectEachRowSchemaReader(factory); + registerJSONAsStringSchemaReader(factory); + registerJSONAsObjectSchemaReader(factory); + registerJSONColumnsSchemaReader(factory); + registerJSONCompactColumnsSchemaReader(factory); + registerJSONColumnsWithMetadataSchemaReader(factory); + registerNativeSchemaReader(factory); + registerRowBinaryWithNamesAndTypesSchemaReader(factory); + registerAvroSchemaReader(factory); + registerProtobufSchemaReader(factory); + registerProtobufListSchemaReader(factory); + registerLineAsStringSchemaReader(factory); + registerRawBLOBSchemaReader(factory); + registerMsgPackSchemaReader(factory); + registerCapnProtoSchemaReader(factory); + registerCustomSeparatedSchemaReader(factory); + registerRegexpSchemaReader(factory); + registerTSKVSchemaReader(factory); + registerValuesSchemaReader(factory); + registerTemplateSchemaReader(factory); + registerMySQLSchemaReader(factory); + registerBSONEachRowSchemaReader(factory); + registerParquetMetadataSchemaReader(factory); + registerOneSchemaReader(factory); +} + +} + diff --git a/contrib/clickhouse/src/Formats/registerFormats.h b/contrib/clickhouse/src/Formats/registerFormats.h new file mode 100644 index 0000000000..e4ff79248d --- /dev/null +++ b/contrib/clickhouse/src/Formats/registerFormats.h @@ -0,0 +1,9 @@ +#pragma 
once + +namespace DB +{ + +void registerFormats(); + +} + diff --git a/contrib/clickhouse/src/Formats/registerWithNamesAndTypes.cpp b/contrib/clickhouse/src/Formats/registerWithNamesAndTypes.cpp new file mode 100644 index 0000000000..674865a3be --- /dev/null +++ b/contrib/clickhouse/src/Formats/registerWithNamesAndTypes.cpp @@ -0,0 +1,20 @@ +#include <Formats/registerWithNamesAndTypes.h> + +namespace DB +{ + +void registerWithNamesAndTypes(const std::string & base_format_name, RegisterWithNamesAndTypesFunc register_func) +{ + register_func(base_format_name, false, false); + register_func(base_format_name + "WithNames", true, false); + register_func(base_format_name + "WithNamesAndTypes", true, true); +} + +void markFormatWithNamesAndTypesSupportsSamplingColumns(const std::string & base_format_name, FormatFactory & factory) +{ + auto setting_checker = [](const FormatSettings & settings){ return settings.with_names_use_header; }; + factory.registerSubsetOfColumnsSupportChecker(base_format_name + "WithNames", setting_checker); + factory.registerSubsetOfColumnsSupportChecker(base_format_name + "WithNamesAndTypes", setting_checker); +} + +} diff --git a/contrib/clickhouse/src/Formats/registerWithNamesAndTypes.h b/contrib/clickhouse/src/Formats/registerWithNamesAndTypes.h new file mode 100644 index 0000000000..50a0eee961 --- /dev/null +++ b/contrib/clickhouse/src/Formats/registerWithNamesAndTypes.h @@ -0,0 +1,15 @@ +#pragma once + +#include <string> +#include <functional> +#include <Formats/FormatFactory.h> + +namespace DB +{ + +using RegisterWithNamesAndTypesFunc = std::function<void(const std::string & format_name, bool with_names, bool with_types)>; +void registerWithNamesAndTypes(const std::string & base_format_name, RegisterWithNamesAndTypesFunc register_func); + +void markFormatWithNamesAndTypesSupportsSamplingColumns(const std::string & base_format_name, FormatFactory & factory); + +} diff --git a/contrib/clickhouse/src/Formats/verbosePrintString.cpp 
b/contrib/clickhouse/src/Formats/verbosePrintString.cpp new file mode 100644 index 0000000000..5c6111c292 --- /dev/null +++ b/contrib/clickhouse/src/Formats/verbosePrintString.cpp @@ -0,0 +1,64 @@ +#include <Formats/verbosePrintString.h> +#include <base/hex.h> +#include <IO/Operators.h> + + +namespace DB +{ + +void verbosePrintString(const char * begin, const char * end, WriteBuffer & out) +{ + if (end == begin) + { + out << "<EMPTY>"; + return; + } + + out << "\""; + + for (const char * pos = begin; pos < end; ++pos) + { + switch (*pos) + { + case '\0': + out << "<ASCII NUL>"; + break; + case '\b': + out << "<BACKSPACE>"; + break; + case '\f': + out << "<FORM FEED>"; + break; + case '\n': + out << "<LINE FEED>"; + break; + case '\r': + out << "<CARRIAGE RETURN>"; + break; + case '\t': + out << "<TAB>"; + break; + case '\\': + out << "<BACKSLASH>"; + break; + case '"': + out << "<DOUBLE QUOTE>"; + break; + case '\'': + out << "<SINGLE QUOTE>"; + break; + + default: + { + if (static_cast<unsigned char>(*pos) < 32) /// ASCII control characters + out << "<0x" << hexDigitUppercase(*pos / 16) << hexDigitUppercase(*pos % 16) << ">"; + else + out << *pos; + } + } + } + + out << "\""; +} + +} diff --git a/contrib/clickhouse/src/Formats/verbosePrintString.h b/contrib/clickhouse/src/Formats/verbosePrintString.h new file mode 100644 index 0000000000..26bd663d55 --- /dev/null +++ b/contrib/clickhouse/src/Formats/verbosePrintString.h @@ -0,0 +1,14 @@ +#pragma once + + +namespace DB +{ + +class WriteBuffer; + + +/** Print string in double quotes and with control characters in "<NAME>" form - for output diagnostic info to user. + */ +void verbosePrintString(const char * begin, const char * end, WriteBuffer & out); + +} |