diff options
| author | vitalyisaev <[email protected]> | 2023-11-14 09:58:56 +0300 |
|---|---|---|
| committer | vitalyisaev <[email protected]> | 2023-11-14 10:20:20 +0300 |
| commit | c2b2dfd9827a400a8495e172a56343462e3ceb82 (patch) | |
| tree | cd4e4f597d01bede4c82dffeb2d780d0a9046bd0 /contrib/clickhouse/src/DataTypes | |
| parent | d4ae8f119e67808cb0cf776ba6e0cf95296f2df7 (diff) | |
YQ Connector: move tests from yql to ydb (OSS)
Перенос папки с тестами на Коннектор из папки yql в папку ydb (синхронизируется с github).
Diffstat (limited to 'contrib/clickhouse/src/DataTypes')
147 files changed, 21122 insertions, 0 deletions
diff --git a/contrib/clickhouse/src/DataTypes/DataTypeAggregateFunction.cpp b/contrib/clickhouse/src/DataTypes/DataTypeAggregateFunction.cpp new file mode 100644 index 00000000000..be60886d74b --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeAggregateFunction.cpp @@ -0,0 +1,271 @@ +#include <IO/WriteHelpers.h> +#include <IO/ReadHelpers.h> + +#include <Columns/ColumnAggregateFunction.h> + +#include <Common/AlignedBuffer.h> +#include <Common/FieldVisitorToString.h> + +#include <Formats/FormatSettings.h> +#include <DataTypes/DataTypeAggregateFunction.h> +#include <DataTypes/Serializations/SerializationAggregateFunction.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/transformTypesRecursively.h> +#include <IO/WriteBufferFromString.h> +#include <IO/Operators.h> + +#include <AggregateFunctions/AggregateFunctionFactory.h> +#include <Parsers/ASTFunction.h> +#include <Parsers/ASTIdentifier_fwd.h> +#include <Parsers/ASTLiteral.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int SYNTAX_ERROR; + extern const int BAD_ARGUMENTS; + extern const int PARAMETERS_TO_AGGREGATE_FUNCTIONS_MUST_BE_LITERALS; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int LOGICAL_ERROR; +} + + +String DataTypeAggregateFunction::doGetName() const +{ + return getNameImpl(true); +} + + +String DataTypeAggregateFunction::getNameWithoutVersion() const +{ + return getNameImpl(false); +} + + +size_t DataTypeAggregateFunction::getVersion() const +{ + if (version) + return *version; + return function->getDefaultVersion(); +} + + +String DataTypeAggregateFunction::getNameImpl(bool with_version) const +{ + WriteBufferFromOwnString stream; + stream << "AggregateFunction("; + + /// If aggregate function does not support versioning its version is 0 and is not printed. 
+ auto data_type_version = getVersion(); + if (with_version && data_type_version) + stream << data_type_version << ", "; + stream << function->getName(); + + if (!parameters.empty()) + { + stream << '('; + for (size_t i = 0, size = parameters.size(); i < size; ++i) + { + if (i) + stream << ", "; + stream << applyVisitor(FieldVisitorToString(), parameters[i]); + } + stream << ')'; + } + + for (const auto & argument_type : argument_types) + stream << ", " << argument_type->getName(); + + stream << ')'; + return stream.str(); +} + + +MutableColumnPtr DataTypeAggregateFunction::createColumn() const +{ + return ColumnAggregateFunction::create(function, getVersion()); +} + + +/// Create empty state +Field DataTypeAggregateFunction::getDefault() const +{ + Field field = AggregateFunctionStateData(); + field.get<AggregateFunctionStateData &>().name = getName(); + + AlignedBuffer place_buffer(function->sizeOfData(), function->alignOfData()); + AggregateDataPtr place = place_buffer.data(); + + function->create(place); + + try + { + WriteBufferFromString buffer_from_field(field.get<AggregateFunctionStateData &>().data); + function->serialize(place, buffer_from_field, version); + } + catch (...) 
+ { + function->destroy(place); + throw; + } + + function->destroy(place); + + return field; +} + +bool DataTypeAggregateFunction::strictEquals(const DataTypePtr & lhs_state_type, const DataTypePtr & rhs_state_type) +{ + const auto * lhs_state = typeid_cast<const DataTypeAggregateFunction *>(lhs_state_type.get()); + const auto * rhs_state = typeid_cast<const DataTypeAggregateFunction *>(rhs_state_type.get()); + + if (!lhs_state || !rhs_state) + return false; + + if (lhs_state->function->getName() != rhs_state->function->getName()) + return false; + + if (lhs_state->parameters.size() != rhs_state->parameters.size()) + return false; + + for (size_t i = 0; i < lhs_state->parameters.size(); ++i) + if (lhs_state->parameters[i] != rhs_state->parameters[i]) + return false; + + if (lhs_state->argument_types.size() != rhs_state->argument_types.size()) + return false; + + for (size_t i = 0; i < lhs_state->argument_types.size(); ++i) + if (!lhs_state->argument_types[i]->equals(*rhs_state->argument_types[i])) + return false; + + return true; +} + +bool DataTypeAggregateFunction::equals(const IDataType & rhs) const +{ + if (typeid(rhs) != typeid(*this)) + return false; + + auto lhs_state_type = function->getNormalizedStateType(); + auto rhs_state_type = typeid_cast<const DataTypeAggregateFunction &>(rhs).function->getNormalizedStateType(); + + return strictEquals(lhs_state_type, rhs_state_type); +} + + +SerializationPtr DataTypeAggregateFunction::doGetDefaultSerialization() const +{ + return std::make_shared<SerializationAggregateFunction>(function, getName(), getVersion()); +} + + +static DataTypePtr create(const ASTPtr & arguments) +{ + String function_name; + AggregateFunctionPtr function; + DataTypes argument_types; + Array params_row; + std::optional<size_t> version; + + if (!arguments || arguments->children.empty()) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Data type AggregateFunction requires parameters: " + "version(optionally), name of 
aggregate function and list of data types for arguments"); + + ASTPtr data_type_ast = arguments->children[0]; + size_t argument_types_start_idx = 1; + + /* If aggregate function definition doesn't have version, it will have in AST children args [ASTFunction, types...] - in case + * it is parametric, or [ASTIdentifier, types...] - otherwise. If aggregate function has version in AST, then it will be: + * [ASTLiteral, ASTFunction (or ASTIdentifier), types...]. + */ + if (auto * version_ast = arguments->children[0]->as<ASTLiteral>()) + { + if (arguments->children.size() < 2) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Data type AggregateFunction has version, but it requires at least one more parameter - name of aggregate function"); + version = version_ast->value.safeGet<UInt64>(); + data_type_ast = arguments->children[1]; + argument_types_start_idx = 2; + } + + if (const auto * parametric = data_type_ast->as<ASTFunction>()) + { + if (parametric->parameters) + throw Exception(ErrorCodes::SYNTAX_ERROR, "Unexpected level of parameters to aggregate function"); + + function_name = parametric->name; + + if (parametric->arguments) + { + const ASTs & parameters = parametric->arguments->children; + params_row.resize(parameters.size()); + + for (size_t i = 0; i < parameters.size(); ++i) + { + const auto * literal = parameters[i]->as<ASTLiteral>(); + if (!literal) + throw Exception( + ErrorCodes::PARAMETERS_TO_AGGREGATE_FUNCTIONS_MUST_BE_LITERALS, + "Parameters to aggregate functions must be literals. 
" + "Got parameter '{}' for function '{}'", + parameters[i]->formatForErrorMessage(), function_name); + + params_row[i] = literal->value; + } + } + } + else if (auto opt_name = tryGetIdentifierName(data_type_ast)) + { + function_name = *opt_name; + } + else if (data_type_ast->as<ASTLiteral>()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Aggregate function name for data type AggregateFunction must " + "be passed as identifier (without quotes) or function"); + } + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Unexpected AST element passed as aggregate function name for data type AggregateFunction. " + "Must be identifier or function."); + + for (size_t i = argument_types_start_idx; i < arguments->children.size(); ++i) + argument_types.push_back(DataTypeFactory::instance().get(arguments->children[i])); + + if (function_name.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: empty name of aggregate function passed"); + + AggregateFunctionProperties properties; + function = AggregateFunctionFactory::instance().get(function_name, argument_types, params_row, properties); + return std::make_shared<DataTypeAggregateFunction>(function, argument_types, params_row, version); +} + +void setVersionToAggregateFunctions(DataTypePtr & type, bool if_empty, std::optional<size_t> revision) +{ + auto callback = [revision, if_empty](DataTypePtr & column_type) + { + const auto * aggregate_function_type = typeid_cast<const DataTypeAggregateFunction *>(column_type.get()); + if (aggregate_function_type && aggregate_function_type->isVersioned()) + { + if (revision) + aggregate_function_type->updateVersionFromRevision(*revision, if_empty); + else + aggregate_function_type->setVersion(0, if_empty); + } + }; + + callOnNestedSimpleTypes(type, callback); +} + + +void registerDataTypeAggregateFunction(DataTypeFactory & factory) +{ + factory.registerDataType("AggregateFunction", create); +} + +} diff --git 
a/contrib/clickhouse/src/DataTypes/DataTypeAggregateFunction.h b/contrib/clickhouse/src/DataTypes/DataTypeAggregateFunction.h new file mode 100644 index 00000000000..6331c23222f --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeAggregateFunction.h @@ -0,0 +1,96 @@ +#pragma once + +#include <AggregateFunctions/IAggregateFunction.h> + +#include <DataTypes/IDataType.h> + + +namespace DB +{ + +/** Type - the state of the aggregate function. + * Type parameters is an aggregate function, the types of its arguments, and its parameters (for parametric aggregate functions). + * + * Data type can support versioning for serialization of aggregate function state. + * Version 0 also means no versioning. When a table with versioned data type is attached, its version is parsed from AST. If + * there is no version in AST, then it is either attach with no version in metadata (then version is 0) or it + * is a new data type (then version is default - latest). + */ +class DataTypeAggregateFunction final : public IDataType +{ +private: + AggregateFunctionPtr function; + DataTypes argument_types; + Array parameters; + mutable std::optional<size_t> version; + + String getNameImpl(bool with_version) const; + size_t getVersion() const; + +public: + static constexpr bool is_parametric = true; + + DataTypeAggregateFunction(AggregateFunctionPtr function_, const DataTypes & argument_types_, + const Array & parameters_, std::optional<size_t> version_ = std::nullopt) + : function(std::move(function_)) + , argument_types(argument_types_) + , parameters(parameters_) + , version(version_) + { + } + + String getFunctionName() const { return function->getName(); } + AggregateFunctionPtr getFunction() const { return function; } + + String doGetName() const override; + String getNameWithoutVersion() const; + const char * getFamilyName() const override { return "AggregateFunction"; } + String getSQLCompatibleName() const override { return "TEXT"; } + TypeIndex getTypeId() const override { 
return TypeIndex::AggregateFunction; } + + Array getParameters() const { return parameters; } + + bool canBeInsideNullable() const override { return false; } + + DataTypePtr getReturnType() const { return function->getResultType(); } + DataTypePtr getReturnTypeToPredict() const { return function->getReturnTypeToPredict(); } + DataTypes getArgumentsDataTypes() const { return argument_types; } + + MutableColumnPtr createColumn() const override; + + Field getDefault() const override; + + static bool strictEquals(const DataTypePtr & lhs_state_type, const DataTypePtr & rhs_state_type); + bool equals(const IDataType & rhs) const override; + + bool isParametric() const override { return true; } + bool haveSubtypes() const override { return false; } + bool shouldAlignRightInPrettyFormats() const override { return false; } + + SerializationPtr doGetDefaultSerialization() const override; + bool supportsSparseSerialization() const override { return false; } + + bool isVersioned() const { return function->isVersioned(); } + + /// Version is not empty only if it was parsed from AST or implicitly cast to 0 or version according + /// to server revision. + /// It is ok to have an empty version value here - then for serialization a default (latest) + /// version is used. This method is used to force some zero version to be used instead of + /// default, or to set version for serialization in distributed queries. 
+ void setVersion(size_t version_, bool if_empty) const + { + if (version && if_empty) + return; + + version = version_; + } + + void updateVersionFromRevision(size_t revision, bool if_empty) const + { + setVersion(function->getVersionFromRevision(revision), if_empty); + } +}; + +void setVersionToAggregateFunctions(DataTypePtr & type, bool if_empty, std::optional<size_t> revision = std::nullopt); + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeArray.cpp b/contrib/clickhouse/src/DataTypes/DataTypeArray.cpp new file mode 100644 index 00000000000..e31f10046b7 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeArray.cpp @@ -0,0 +1,77 @@ +#include <Columns/ColumnArray.h> + +#include <Formats/FormatSettings.h> +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/Serializations/SerializationArray.h> + +#include <Parsers/IAST.h> + +#include <Common/typeid_cast.h> +#include <Common/assert_cast.h> + +#include <Core/NamesAndTypes.h> +#include <Columns/ColumnConst.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} +using FieldType = Array; + + +DataTypeArray::DataTypeArray(const DataTypePtr & nested_) + : nested{nested_} +{ +} + + +MutableColumnPtr DataTypeArray::createColumn() const +{ + return ColumnArray::create(nested->createColumn(), ColumnArray::ColumnOffsets::create()); +} + +Field DataTypeArray::getDefault() const +{ + return Array(); +} + + +bool DataTypeArray::equals(const IDataType & rhs) const +{ + return typeid(rhs) == typeid(*this) && nested->equals(*static_cast<const DataTypeArray &>(rhs).nested); +} + +SerializationPtr DataTypeArray::doGetDefaultSerialization() const +{ + return std::make_shared<SerializationArray>(nested->getDefaultSerialization()); +} + +size_t DataTypeArray::getNumberOfDimensions() const +{ + const DataTypeArray * nested_array = typeid_cast<const DataTypeArray *>(nested.get()); + if (!nested_array) + return 1; + return 1 
+ nested_array->getNumberOfDimensions(); /// Every modern C++ compiler optimizes tail recursion. +} + + +static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Array data type family must have exactly one argument - type of elements"); + + return std::make_shared<DataTypeArray>(DataTypeFactory::instance().get(arguments->children[0])); +} + + +void registerDataTypeArray(DataTypeFactory & factory) +{ + factory.registerDataType("Array", create); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeArray.h b/contrib/clickhouse/src/DataTypes/DataTypeArray.h new file mode 100644 index 00000000000..68b574b8ded --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeArray.h @@ -0,0 +1,74 @@ +#pragma once + +#include <DataTypes/IDataType.h> +#include <DataTypes/Serializations/SerializationArray.h> +#include <Columns/ColumnArray.h> + + +namespace DB +{ + + +class DataTypeArray final : public IDataType +{ +private: + /// The type of array elements. 
+ DataTypePtr nested; + +public: + using FieldType = Array; + using ColumnType = ColumnArray; + static constexpr bool is_parametric = true; + + explicit DataTypeArray(const DataTypePtr & nested_); + + TypeIndex getTypeId() const override { return TypeIndex::Array; } + + std::string doGetName() const override + { + return "Array(" + nested->getName() + ")"; + } + + const char * getFamilyName() const override + { + return "Array"; + } + String getSQLCompatibleName() const override + { + return "TEXT"; + } + + bool canBeInsideNullable() const override + { + return false; + } + + MutableColumnPtr createColumn() const override; + + + Field getDefault() const override; + + bool equals(const IDataType & rhs) const override; + + bool isParametric() const override { return true; } + bool haveSubtypes() const override { return true; } + bool cannotBeStoredInTables() const override { return nested->cannotBeStoredInTables(); } + bool textCanContainOnlyValidUTF8() const override { return nested->textCanContainOnlyValidUTF8(); } + bool isComparable() const override { return nested->isComparable(); } + bool canBeComparedWithCollation() const override { return nested->canBeComparedWithCollation(); } + bool hasDynamicSubcolumns() const override { return nested->hasDynamicSubcolumns(); } + + bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override + { + return nested->isValueUnambiguouslyRepresentedInFixedSizeContiguousMemoryRegion(); + } + + SerializationPtr doGetDefaultSerialization() const override; + + const DataTypePtr & getNestedType() const { return nested; } + + /// 1 for plain array, 2 for array of arrays and so on. 
+ size_t getNumberOfDimensions() const; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeCustom.h b/contrib/clickhouse/src/DataTypes/DataTypeCustom.h new file mode 100644 index 00000000000..cf1e943d8e9 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeCustom.h @@ -0,0 +1,56 @@ +#pragma once + +#include <memory> +#include <cstddef> +#include <Core/Types_fwd.h> +#include <DataTypes/Serializations/ISerialization.h> + +namespace DB +{ + +class ReadBuffer; +class WriteBuffer; +struct FormatSettings; +class IColumn; + +/** Allow to customize an existing data type and set a different name and/or text serialization/deserialization methods. + * See use in IPv4 and IPv6 data types, and also in SimpleAggregateFunction. + */ +class IDataTypeCustomName +{ +public: + virtual ~IDataTypeCustomName() = default; + + virtual String getName() const = 0; +}; + +using DataTypeCustomNamePtr = std::unique_ptr<const IDataTypeCustomName>; + +/** Describe a data type customization + */ +struct DataTypeCustomDesc +{ + DataTypeCustomNamePtr name; + SerializationPtr serialization; + + explicit DataTypeCustomDesc( + DataTypeCustomNamePtr name_, + SerializationPtr serialization_ = nullptr) + : name(std::move(name_)) + , serialization(std::move(serialization_)) {} +}; + +using DataTypeCustomDescPtr = std::unique_ptr<DataTypeCustomDesc>; + +/** A simple implementation of IDataTypeCustomName + */ +class DataTypeCustomFixedName : public IDataTypeCustomName +{ +private: + String name; +public: + explicit DataTypeCustomFixedName(String name_) : name(name_) {} + String getName() const override { return name; } +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeCustomGeo.cpp b/contrib/clickhouse/src/DataTypes/DataTypeCustomGeo.cpp new file mode 100644 index 00000000000..f7d05fa3be6 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeCustomGeo.cpp @@ -0,0 +1,43 @@ +#include <DataTypes/DataTypeCustomGeo.h> +#include <DataTypes/DataTypeArray.h> +#include 
<DataTypes/DataTypeCustom.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/DataTypesNumber.h> + +namespace DB +{ + +void registerDataTypeDomainGeo(DataTypeFactory & factory) +{ + // Custom type for point represented as its coordinates stored as Tuple(Float64, Float64) + factory.registerSimpleDataTypeCustom("Point", [] + { + return std::make_pair(DataTypeFactory::instance().get("Tuple(Float64, Float64)"), + std::make_unique<DataTypeCustomDesc>(std::make_unique<DataTypePointName>())); + }); + + // Custom type for simple polygon without holes stored as Array(Point) + factory.registerSimpleDataTypeCustom("Ring", [] + { + return std::make_pair(DataTypeFactory::instance().get("Array(Point)"), + std::make_unique<DataTypeCustomDesc>(std::make_unique<DataTypeRingName>())); + }); + + // Custom type for polygon with holes stored as Array(Ring) + // First element of outer array is outer shape of polygon and all the following are holes + factory.registerSimpleDataTypeCustom("Polygon", [] + { + return std::make_pair(DataTypeFactory::instance().get("Array(Ring)"), + std::make_unique<DataTypeCustomDesc>(std::make_unique<DataTypePolygonName>())); + }); + + // Custom type for multiple polygons with holes stored as Array(Polygon) + factory.registerSimpleDataTypeCustom("MultiPolygon", [] + { + return std::make_pair(DataTypeFactory::instance().get("Array(Polygon)"), + std::make_unique<DataTypeCustomDesc>(std::make_unique<DataTypeMultiPolygonName>())); + }); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeCustomGeo.h b/contrib/clickhouse/src/DataTypes/DataTypeCustomGeo.h new file mode 100644 index 00000000000..c2a83b3e577 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeCustomGeo.h @@ -0,0 +1,32 @@ +#pragma once + +#include <DataTypes/DataTypeCustom.h> + +namespace DB +{ + +class DataTypePointName : public DataTypeCustomFixedName +{ +public: + DataTypePointName() : DataTypeCustomFixedName("Point") {} +}; + 
+class DataTypeRingName : public DataTypeCustomFixedName +{ +public: + DataTypeRingName() : DataTypeCustomFixedName("Ring") {} +}; + +class DataTypePolygonName : public DataTypeCustomFixedName +{ +public: + DataTypePolygonName() : DataTypeCustomFixedName("Polygon") {} +}; + +class DataTypeMultiPolygonName : public DataTypeCustomFixedName +{ +public: + DataTypeMultiPolygonName() : DataTypeCustomFixedName("MultiPolygon") {} +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeCustomSimpleAggregateFunction.cpp b/contrib/clickhouse/src/DataTypes/DataTypeCustomSimpleAggregateFunction.cpp new file mode 100644 index 00000000000..4e50be0a0cc --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeCustomSimpleAggregateFunction.cpp @@ -0,0 +1,170 @@ +#include <Common/FieldVisitorToString.h> +#include <Common/typeid_cast.h> + +#include <DataTypes/DataTypeCustomSimpleAggregateFunction.h> +#include <DataTypes/DataTypeLowCardinality.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/DataTypeFactory.h> + +#include <AggregateFunctions/AggregateFunctionFactory.h> +#include <Parsers/ASTFunction.h> +#include <Parsers/ASTIdentifier.h> +#include <Parsers/ASTLiteral.h> +#include <Parsers/ASTSelectWithUnionQuery.h> + +#include <boost/algorithm/string/join.hpp> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int SYNTAX_ERROR; + extern const int BAD_ARGUMENTS; + extern const int PARAMETERS_TO_AGGREGATE_FUNCTIONS_MUST_BE_LITERALS; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int LOGICAL_ERROR; +} + +void DataTypeCustomSimpleAggregateFunction::checkSupportedFunctions(const AggregateFunctionPtr & function) +{ + /// TODO Make it sane. 
+ static const std::vector<String> supported_functions{ + "any", + "anyLast", + "min", + "max", + "sum", + "sumWithOverflow", + "groupBitAnd", + "groupBitOr", + "groupBitXor", + "sumMap", + "minMap", + "maxMap", + "groupArrayArray", + "groupArrayLastArray", + "groupUniqArrayArray", + "sumMappedArrays", + "minMappedArrays", + "maxMappedArrays", + }; + + // check function + if (std::find(std::begin(supported_functions), std::end(supported_functions), function->getName()) == std::end(supported_functions)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported aggregate function {}, supported functions are {}", + function->getName(), boost::algorithm::join(supported_functions, ",")); + } +} + +String DataTypeCustomSimpleAggregateFunction::getName() const +{ + WriteBufferFromOwnString stream; + stream << "SimpleAggregateFunction(" << function->getName(); + + if (!parameters.empty()) + { + stream << "("; + for (size_t i = 0; i < parameters.size(); ++i) + { + if (i) + stream << ", "; + stream << applyVisitor(FieldVisitorToString(), parameters[i]); + } + stream << ")"; + } + + for (const auto & argument_type : argument_types) + stream << ", " << argument_type->getName(); + + stream << ")"; + return stream.str(); +} + + +static std::pair<DataTypePtr, DataTypeCustomDescPtr> create(const ASTPtr & arguments) +{ + String function_name; + AggregateFunctionPtr function; + DataTypes argument_types; + Array params_row; + + if (!arguments || arguments->children.empty()) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Data type SimpleAggregateFunction requires parameters: " + "name of aggregate function and list of data types for arguments"); + + if (const ASTFunction * parametric = arguments->children[0]->as<ASTFunction>()) + { + if (parametric->parameters) + throw Exception(ErrorCodes::SYNTAX_ERROR, "Unexpected level of parameters to aggregate function"); + function_name = parametric->name; + + if (parametric->arguments) + { + const ASTs & parameters = 
parametric->arguments->as<ASTExpressionList &>().children; + params_row.resize(parameters.size()); + + for (size_t i = 0; i < parameters.size(); ++i) + { + const ASTLiteral * lit = parameters[i]->as<ASTLiteral>(); + if (!lit) + throw Exception( + ErrorCodes::PARAMETERS_TO_AGGREGATE_FUNCTIONS_MUST_BE_LITERALS, + "Parameters to aggregate functions must be literals. " + "Got parameter '{}' for function '{}'", + parameters[i]->formatForErrorMessage(), + function_name); + + params_row[i] = lit->value; + } + } + } + else if (auto opt_name = tryGetIdentifierName(arguments->children[0])) + { + function_name = *opt_name; + } + else if (arguments->children[0]->as<ASTLiteral>()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Aggregate function name for data type SimpleAggregateFunction must " + "be passed as identifier (without quotes) or function"); + } + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Unexpected AST element passed as aggregate function name for data type " + "SimpleAggregateFunction. 
Must be identifier or function."); + + for (size_t i = 1; i < arguments->children.size(); ++i) + argument_types.push_back(DataTypeFactory::instance().get(arguments->children[i])); + + if (function_name.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: empty name of aggregate function passed"); + + AggregateFunctionProperties properties; + function = AggregateFunctionFactory::instance().get(function_name, argument_types, params_row, properties); + + DataTypeCustomSimpleAggregateFunction::checkSupportedFunctions(function); + + DataTypePtr storage_type = DataTypeFactory::instance().get(argument_types[0]->getName()); + + if (!function->getResultType()->equals(*removeLowCardinality(storage_type))) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incompatible data types between aggregate function '{}' " + "which returns {} and column storage type {}", + function->getName(), function->getResultType()->getName(), storage_type->getName()); + } + + DataTypeCustomNamePtr custom_name = std::make_unique<DataTypeCustomSimpleAggregateFunction>(function, argument_types, params_row); + + return std::make_pair(storage_type, std::make_unique<DataTypeCustomDesc>(std::move(custom_name), nullptr)); +} + +void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory) +{ + factory.registerDataTypeCustom("SimpleAggregateFunction", create); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeCustomSimpleAggregateFunction.h b/contrib/clickhouse/src/DataTypes/DataTypeCustomSimpleAggregateFunction.h new file mode 100644 index 00000000000..926dfd9cc82 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeCustomSimpleAggregateFunction.h @@ -0,0 +1,42 @@ +#pragma once + +#include <DataTypes/DataTypeCustom.h> +#include <AggregateFunctions/IAggregateFunction.h> + +#include <IO/ReadHelpers.h> + +namespace DB +{ + +/** The type SimpleAggregateFunction(fct, type) is meant to be used in an AggregatingMergeTree. 
It behaves like a standard + * data type but when rows are merged, an aggregation function is applied. + * + * The aggregation function is limited to simple functions whose merge state is the final result: + * any, anyLast, min, max, sum + * + * Examples: + * + * SimpleAggregateFunction(sum, Nullable(Float64)) + * SimpleAggregateFunction(anyLast, LowCardinality(Nullable(String))) + * SimpleAggregateFunction(anyLast, IPv4) + * + * Technically, a standard IDataType is instantiated and customized with IDataTypeCustomName and DataTypeCustomDesc. + */ + +class DataTypeCustomSimpleAggregateFunction : public IDataTypeCustomName +{ +private: + const AggregateFunctionPtr function; + const DataTypes argument_types; + const Array parameters; + +public: + DataTypeCustomSimpleAggregateFunction(const AggregateFunctionPtr & function_, const DataTypes & argument_types_, const Array & parameters_) + : function(function_), argument_types(argument_types_), parameters(parameters_) {} + + AggregateFunctionPtr getFunction() const { return function; } + String getName() const override; + static void checkSupportedFunctions(const AggregateFunctionPtr & function); +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeDate.cpp b/contrib/clickhouse/src/DataTypes/DataTypeDate.cpp new file mode 100644 index 00000000000..ee4b0065e59 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeDate.cpp @@ -0,0 +1,23 @@ +#include <DataTypes/DataTypeDate.h> +#include <DataTypes/Serializations/SerializationDate.h> +#include <DataTypes/DataTypeFactory.h> + +namespace DB +{ + +bool DataTypeDate::equals(const IDataType & rhs) const +{ + return typeid(rhs) == typeid(*this); +} + +SerializationPtr DataTypeDate::doGetDefaultSerialization() const +{ + return std::make_shared<SerializationDate>(); +} + +void registerDataTypeDate(DataTypeFactory & factory) +{ + factory.registerSimpleDataType("Date", [] { return DataTypePtr(std::make_shared<DataTypeDate>()); }, DataTypeFactory::CaseInsensitive); +} 
+ +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeDate.h b/contrib/clickhouse/src/DataTypes/DataTypeDate.h new file mode 100644 index 00000000000..0d557cad5f0 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeDate.h @@ -0,0 +1,27 @@ +#pragma once + +#include <DataTypes/DataTypeNumberBase.h> + + +namespace DB +{ + +class DataTypeDate final : public DataTypeNumberBase<UInt16> +{ +public: + static constexpr auto family_name = "Date"; + + TypeIndex getTypeId() const override { return TypeIndex::Date; } + const char * getFamilyName() const override { return family_name; } + String getSQLCompatibleName() const override { return "DATE"; } + + bool canBeUsedAsVersion() const override { return true; } + bool canBeInsideNullable() const override { return true; } + + bool equals(const IDataType & rhs) const override; + +protected: + SerializationPtr doGetDefaultSerialization() const override; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeDate32.cpp b/contrib/clickhouse/src/DataTypes/DataTypeDate32.cpp new file mode 100644 index 00000000000..83b1260eb6d --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeDate32.cpp @@ -0,0 +1,23 @@ +#include <DataTypes/DataTypeDate32.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/Serializations/SerializationDate32.h> + +namespace DB +{ +bool DataTypeDate32::equals(const IDataType & rhs) const +{ + return typeid(rhs) == typeid(*this); +} + +SerializationPtr DataTypeDate32::doGetDefaultSerialization() const +{ + return std::make_shared<SerializationDate32>(); +} + +void registerDataTypeDate32(DataTypeFactory & factory) +{ + factory.registerSimpleDataType( + "Date32", [] { return DataTypePtr(std::make_shared<DataTypeDate32>()); }, DataTypeFactory::CaseInsensitive); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeDate32.h b/contrib/clickhouse/src/DataTypes/DataTypeDate32.h new file mode 100644 index 00000000000..0879a404179 --- /dev/null +++ 
b/contrib/clickhouse/src/DataTypes/DataTypeDate32.h @@ -0,0 +1,31 @@ +#pragma once + +#include <Core/Field.h> +#include <Common/DateLUT.h> +#include <DataTypes/DataTypeNumberBase.h> + +namespace DB +{ +class DataTypeDate32 final : public DataTypeNumberBase<Int32> +{ +public: + static constexpr auto family_name = "Date32"; + + TypeIndex getTypeId() const override { return TypeIndex::Date32; } + const char * getFamilyName() const override { return family_name; } + String getSQLCompatibleName() const override { return "DATE"; } + + Field getDefault() const override + { + return -static_cast<Int64>(DateLUT::instance().getDayNumOffsetEpoch()); + } + + bool canBeUsedAsVersion() const override { return true; } + bool canBeInsideNullable() const override { return true; } + + bool equals(const IDataType & rhs) const override; + +protected: + SerializationPtr doGetDefaultSerialization() const override; +}; +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeDateTime.cpp b/contrib/clickhouse/src/DataTypes/DataTypeDateTime.cpp new file mode 100644 index 00000000000..c7722e1c1d9 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeDateTime.cpp @@ -0,0 +1,42 @@ +#include <DataTypes/DataTypeDateTime.h> +#include <DataTypes/Serializations/SerializationDateTime.h> + +#include <IO/Operators.h> +#include <IO/WriteBufferFromString.h> + +namespace DB +{ + +DataTypeDateTime::DataTypeDateTime(const String & time_zone_name) + : TimezoneMixin(time_zone_name) +{ +} + +DataTypeDateTime::DataTypeDateTime(const TimezoneMixin & time_zone_) + : TimezoneMixin(time_zone_) +{ +} + +String DataTypeDateTime::doGetName() const +{ + if (!has_explicit_time_zone) + return "DateTime"; + + WriteBufferFromOwnString out; + out << "DateTime(" << quote << time_zone.getTimeZone() << ")"; + return out.str(); +} + +bool DataTypeDateTime::equals(const IDataType & rhs) const +{ + /// DateTime with different timezones are equal, because: + /// "all types with different time zones are equivalent and may 
be used interchangeably." + return typeid(rhs) == typeid(*this); +} + +SerializationPtr DataTypeDateTime::doGetDefaultSerialization() const +{ + return std::make_shared<SerializationDateTime>(*this); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeDateTime.h b/contrib/clickhouse/src/DataTypes/DataTypeDateTime.h new file mode 100644 index 00000000000..a473aae1faf --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeDateTime.h @@ -0,0 +1,53 @@ +#pragma once + +#include <Core/Types.h> +#include <DataTypes/DataTypeNumberBase.h> +#include <DataTypes/TimezoneMixin.h> + +namespace DB +{ + +/** DateTime stores time as unix timestamp. + * The value itself is independent of time zone. + * + * In binary format it is represented as unix timestamp. + * In text format it is serialized to and parsed from YYYY-MM-DD hh:mm:ss format. + * The text format is dependent on time zone. + * + * To cast from/to text format, time zone may be specified explicitly or implicit time zone may be used. + * + * Time zone may be specified explicitly as type parameter, example: DateTime('Pacific/Pitcairn'). + * As it does not affect the internal representation of values, + * all types with different time zones are equivalent and may be used interchangeably. + * Time zone only affects parsing and displaying in text formats. + * + * If time zone is not specified (example: DateTime without parameter), + * then `session_timezone` setting value is used. + * If `session_timezone` is not set (or empty string), server default time zone is used. + * Default time zone is server time zone, if server is doing transformations + * and if client is doing transformations, unless 'use_client_time_zone' setting is passed to client; + * Server time zone is the time zone specified in 'timezone' parameter in configuration file, + * or system time zone at the moment of server startup. 
+ */ +class DataTypeDateTime final : public DataTypeNumberBase<UInt32>, public TimezoneMixin +{ +public: + explicit DataTypeDateTime(const String & time_zone_name = ""); + explicit DataTypeDateTime(const TimezoneMixin & time_zone); + + static constexpr auto family_name = "DateTime"; + + const char * getFamilyName() const override { return family_name; } + String getSQLCompatibleName() const override { return "DATETIME"; } + String doGetName() const override; + TypeIndex getTypeId() const override { return TypeIndex::DateTime; } + + bool canBeUsedAsVersion() const override { return true; } + bool canBeInsideNullable() const override { return true; } + + bool equals(const IDataType & rhs) const override; + + SerializationPtr doGetDefaultSerialization() const override; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeDateTime64.cpp b/contrib/clickhouse/src/DataTypes/DataTypeDateTime64.cpp new file mode 100644 index 00000000000..124fea1f458 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeDateTime64.cpp @@ -0,0 +1,70 @@ +#include <DataTypes/DataTypeDateTime64.h> +#include <DataTypes/Serializations/SerializationDateTime64.h> +#include <IO/Operators.h> +#include <IO/WriteBufferFromString.h> +#include <optional> +#include <string> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ARGUMENT_OUT_OF_BOUND; + extern const int LOGICAL_ERROR; +} + +static constexpr UInt32 max_scale = 9; + +DataTypeDateTime64::DataTypeDateTime64(UInt32 scale_, const std::string & time_zone_name) + : DataTypeDecimalBase<DateTime64>(DecimalUtils::max_precision<DateTime64>, scale_), + TimezoneMixin(time_zone_name) +{ + if (scale > max_scale) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Scale {} is too large for DateTime64. 
" + "Maximum is up to nanoseconds (9).", std::to_string(scale)); +} + +DataTypeDateTime64::DataTypeDateTime64(UInt32 scale_, const TimezoneMixin & time_zone_info) + : DataTypeDecimalBase<DateTime64>(DecimalUtils::max_precision<DateTime64>, scale_), + TimezoneMixin(time_zone_info) +{ + if (scale > max_scale) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Scale {} is too large for DateTime64. " + "Maximum is up to nanoseconds (9).", std::to_string(scale)); +} + +std::string DataTypeDateTime64::doGetName() const +{ + if (!has_explicit_time_zone) + return std::string(getFamilyName()) + "(" + std::to_string(this->scale) + ")"; + + WriteBufferFromOwnString out; + out << "DateTime64(" << this->scale << ", " << quote << time_zone.getTimeZone() << ")"; + return out.str(); +} + +bool DataTypeDateTime64::equals(const IDataType & rhs) const +{ + if (const auto * ptype = typeid_cast<const DataTypeDateTime64 *>(&rhs)) + return this->scale == ptype->getScale(); + return false; +} + +SerializationPtr DataTypeDateTime64::doGetDefaultSerialization() const +{ + return std::make_shared<SerializationDateTime64>(scale, *this); +} + +std::string getDateTimeTimezone(const IDataType & data_type) +{ + if (const auto * type = typeid_cast<const DataTypeDateTime *>(&data_type)) + return type->hasExplicitTimeZone() ? type->getTimeZone().getTimeZone() : std::string(); + if (const auto * type = typeid_cast<const DataTypeDateTime64 *>(&data_type)) + return type->hasExplicitTimeZone() ? 
type->getTimeZone().getTimeZone() : std::string(); + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get time zone from type {}", data_type.getName()); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeDateTime64.h b/contrib/clickhouse/src/DataTypes/DataTypeDateTime64.h new file mode 100644 index 00000000000..7663518807f --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeDateTime64.h @@ -0,0 +1,50 @@ +#pragma once + +#include <Core/Types.h> +#include <DataTypes/DataTypeDateTime.h> +#include <DataTypes/DataTypeDecimalBase.h> + +class DateLUTImpl; + +namespace DB +{ + +/** DateTime64 is same as DateTime, but it stores values as Int64 and has configurable sub-second part. + * + * `scale` determines number of decimal places for sub-second part of the DateTime64. + */ +class DataTypeDateTime64 final : public DataTypeDecimalBase<DateTime64>, public TimezoneMixin +{ +public: + using Base = DataTypeDecimalBase<DateTime64>; + static constexpr UInt8 default_scale = 3; + + static constexpr auto family_name = "DateTime64"; + static constexpr auto type_id = TypeIndex::DateTime64; + + explicit DataTypeDateTime64(UInt32 scale_, const std::string & time_zone_name = ""); + + // reuse timezone from other DateTime/DateTime64 + DataTypeDateTime64(UInt32 scale_, const TimezoneMixin & time_zone_info); + + const char * getFamilyName() const override { return family_name; } + String getSQLCompatibleName() const override { return "DATETIME"; } + std::string doGetName() const override; + TypeIndex getTypeId() const override { return type_id; } + + bool equals(const IDataType & rhs) const override; + + bool canBePromoted() const override { return false; } + + bool canBeUsedAsVersion() const override { return true; } + + bool isSummable() const override { return false; } + +protected: + SerializationPtr doGetDefaultSerialization() const override; +}; + +std::string getDateTimeTimezone(const IDataType & data_type); + +} + diff --git 
a/contrib/clickhouse/src/DataTypes/DataTypeDecimalBase.cpp b/contrib/clickhouse/src/DataTypes/DataTypeDecimalBase.cpp new file mode 100644 index 00000000000..62218694924 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeDecimalBase.cpp @@ -0,0 +1,47 @@ +#include <DataTypes/DataTypeDecimalBase.h> +#include <Interpreters/Context.h> +#include <type_traits> + +namespace DB +{ + +namespace ErrorCodes +{ +} + +/// Returns the `decimal_check_overflow` setting of the given context. +bool decimalCheckComparisonOverflow(ContextPtr context) +{ + return context->getSettingsRef().decimal_check_overflow; +} +/// Returns the `decimal_check_overflow` setting of the given context (the same setting is used for comparison and arithmetic). +bool decimalCheckArithmeticOverflow(ContextPtr context) +{ + return context->getSettingsRef().decimal_check_overflow; +} + +/// Default value: a DecimalField built from T(0) and this type's scale. +template <is_decimal T> +Field DataTypeDecimalBase<T>::getDefault() const +{ + return DecimalField(T(0), scale); +} + +/// Creates the column through ColumnType::create(0, scale); the first argument is presumably the initial size, confirm against ColumnDecimal. +template <is_decimal T> +MutableColumnPtr DataTypeDecimalBase<T>::createColumn() const +{ + return ColumnType::create(0, scale); +} + +/// Delegates to DecimalUtils::scaleMultiplier for the native type of T. +template <is_decimal T> +T DataTypeDecimalBase<T>::getScaleMultiplier(UInt32 scale_) +{ + return DecimalUtils::scaleMultiplier<typename T::NativeType>(scale_); +} + + +/// Explicit template instantiations. 
+template class DataTypeDecimalBase<Decimal32>; +template class DataTypeDecimalBase<Decimal64>; +template class DataTypeDecimalBase<Decimal128>; +template class DataTypeDecimalBase<Decimal256>; +template class DataTypeDecimalBase<DateTime64>; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeDecimalBase.h b/contrib/clickhouse/src/DataTypes/DataTypeDecimalBase.h new file mode 100644 index 00000000000..adbe9c95b14 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeDecimalBase.h @@ -0,0 +1,210 @@ +#pragma once + +#include <cmath> +#include <type_traits> + +#include <Core/TypeId.h> +#include <Core/DecimalFunctions.h> +#include <Columns/ColumnDecimal.h> +#include <DataTypes/IDataType.h> +#include <DataTypes/DataTypesNumber.h> +#include <Interpreters/Context_fwd.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ARGUMENT_OUT_OF_BOUND; +} + +bool decimalCheckComparisonOverflow(ContextPtr context); +bool decimalCheckArithmeticOverflow(ContextPtr context); + +/// Number of decimal digits in the largest value of the given integer type: +/// 3 for Int8/UInt8, 5 for Int16/UInt16, 10 for Int32/UInt32, 19 for Int64, 20 for UInt64. +/// Returns 0 for any other TypeIndex. +inline UInt32 leastDecimalPrecisionFor(TypeIndex int_type) +{ + switch (int_type) + { + case TypeIndex::Int8: [[fallthrough]]; + case TypeIndex::UInt8: + return 3; + case TypeIndex::Int16: [[fallthrough]]; + case TypeIndex::UInt16: + return 5; + case TypeIndex::Int32: [[fallthrough]]; + case TypeIndex::UInt32: + return 10; + case TypeIndex::Int64: + return 19; + case TypeIndex::UInt64: + return 20; + default: + break; + } + return 0; +} + +/// Base class for decimals, like Decimal(P, S), where P is precision, S is scale. +/// Maximum precisions for underlying types are: +/// Int32 9 +/// Int64 18 +/// Int128 38 +/// Int256 76 +/// Operation between two decimals leads to Decimal(P, S), where +/// P is one of (9, 18, 38, 76); equals to the maximum precision for the biggest underlying type of operands. +/// S is maximum scale of operands. 
The allowed values are [0, precision] +template <is_decimal T> +class DataTypeDecimalBase : public IDataType +{ +public: + using FieldType = T; + using ColumnType = ColumnDecimal<T>; + static constexpr auto type_id = TypeToTypeIndex<T>; + + static constexpr bool is_parametric = true; + + static constexpr size_t maxPrecision() { return DecimalUtils::max_precision<T>; } + + /// Throws ARGUMENT_OUT_OF_BOUND if precision is outside [1, maxPrecision()] or if scale exceeds maxPrecision(). + /// NOTE(review): scale is checked against maxPrecision() here, not against precision_. + DataTypeDecimalBase(UInt32 precision_, UInt32 scale_) + : precision(precision_), + scale(scale_) + { + if (unlikely(precision < 1 || precision > maxPrecision())) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, + "Precision {} is out of bounds (precision range: [1, {}])", + std::to_string(precision), maxPrecision()); + if (unlikely(scale > maxPrecision())) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Scale {} is out of bounds (max scale: {})", + std::to_string(scale), maxPrecision()); + } + + TypeIndex getTypeId() const override { return TypeToTypeIndex<T>; } + + Field getDefault() const override; + MutableColumnPtr createColumn() const override; + + bool isParametric() const override { return true; } + bool haveSubtypes() const override { return false; } + bool shouldAlignRightInPrettyFormats() const override { return true; } + bool textCanContainOnlyValidUTF8() const override { return true; } + bool isComparable() const override { return true; } + bool isValueRepresentedByNumber() const override { return true; } + bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override { return true; } + bool haveMaximumSizeOfValue() const override { return true; } + size_t getSizeOfValueInMemory() const override { return sizeof(T); } + + bool isSummable() const override { return true; } + bool canBeUsedInBooleanContext() const override { return true; } + bool canBeInsideNullable() const override { return true; } + + /// Decimal specific + + UInt32 getPrecision() const { return precision; } + UInt32 getScale() const { return scale; } + T getScaleMultiplier() const { return 
getScaleMultiplier(scale); } + + T wholePart(T x) const + { + return DecimalUtils::getWholePart(x, scale); + } + + T fractionalPart(T x) const + { + return DecimalUtils::getFractionalPart(x, scale); + } + + T maxWholeValue() const { return getScaleMultiplier(precision - scale) - T(1); } + + template <typename U> + bool canStoreWhole(U x) const + { + static_assert(is_signed_v<typename T::NativeType>); + T max = maxWholeValue(); + if constexpr (is_signed_v<U>) + return -max.value <= x && x <= max.value; + else + return x <= static_cast<make_unsigned_t<typename T::NativeType>>(max.value); + } + + /// @returns multiplier for U to become T with correct scale + template <typename U> + T scaleFactorFor(const DataTypeDecimalBase<U> & x, bool) const + { + if (getScale() < x.getScale()) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Decimal result's scale is less than argument's one"); + UInt32 scale_delta = getScale() - x.getScale(); /// scale_delta >= 0 + return getScaleMultiplier(scale_delta); + } + + template <typename U> + T scaleFactorFor(const DataTypeNumber<U> & , bool is_multiply_or_divisor) const + { + if (is_multiply_or_divisor) + return T(1); + return getScaleMultiplier(); + } + + static T getScaleMultiplier(UInt32 scale); + + inline DecimalUtils::DataTypeDecimalTrait<T> getTrait() const + { + return {precision, scale}; + } + +protected: + const UInt32 precision; + const UInt32 scale; +}; + + +template <typename T> +inline const DataTypeDecimalBase<T> * checkDecimalBase(const IDataType & data_type) +{ + if (isColumnedAsDecimalT<T>(data_type)) + return static_cast<const DataTypeDecimalBase<T> *>(&data_type); + + return nullptr; +} + +template <bool is_multiply, bool is_division, typename T, typename U, template <typename> typename DecimalType> +inline auto decimalResultType(const DecimalType<T> & tx, const DecimalType<U> & ty) +{ + const auto result_trait = DecimalUtils::binaryOpResult<is_multiply, is_division>(tx, ty); + return DecimalType<typename 
decltype(result_trait)::FieldType>(result_trait.precision, result_trait.scale); +} + +template <bool is_multiply, bool is_division, typename T, typename U, template <typename> typename DecimalType> +inline DecimalType<T> decimalResultType(const DecimalType<T> & tx, const DataTypeNumber<U> & ty) +{ + const auto result_trait = DecimalUtils::binaryOpResult<is_multiply, is_division>(tx, ty); + return DecimalType<typename decltype(result_trait)::FieldType>(result_trait.precision, result_trait.scale); +} + +template <bool is_multiply, bool is_division, typename T, typename U, template <typename> typename DecimalType> +inline DecimalType<U> decimalResultType(const DataTypeNumber<T> & tx, const DecimalType<U> & ty) +{ + const auto result_trait = DecimalUtils::binaryOpResult<is_multiply, is_division>(tx, ty); + return DecimalType<typename decltype(result_trait)::FieldType>(result_trait.precision, result_trait.scale); +} + +template <template <typename> typename DecimalType> +inline DataTypePtr createDecimal(UInt64 precision_value, UInt64 scale_value) +{ + if (precision_value < DecimalUtils::min_precision || precision_value > DecimalUtils::max_precision<Decimal256>) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Wrong precision: it must be between {} and {}, got {}", + DecimalUtils::min_precision, DecimalUtils::max_precision<Decimal256>, precision_value); + + if (static_cast<UInt64>(scale_value) > precision_value) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Negative scales and scales larger than precision are not supported"); + + if (precision_value <= DecimalUtils::max_precision<Decimal32>) + return std::make_shared<DecimalType<Decimal32>>(precision_value, scale_value); + else if (precision_value <= DecimalUtils::max_precision<Decimal64>) + return std::make_shared<DecimalType<Decimal64>>(precision_value, scale_value); + else if (precision_value <= DecimalUtils::max_precision<Decimal128>) + return std::make_shared<DecimalType<Decimal128>>(precision_value, 
scale_value); + return std::make_shared<DecimalType<Decimal256>>(precision_value, scale_value); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeDomainBool.cpp b/contrib/clickhouse/src/DataTypes/DataTypeDomainBool.cpp new file mode 100644 index 00000000000..245c5495299 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeDomainBool.cpp @@ -0,0 +1,21 @@ +#include <DataTypes/Serializations/SerializationBool.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/DataTypeCustom.h> + +namespace DB +{ + +void registerDataTypeDomainBool(DataTypeFactory & factory) +{ + factory.registerSimpleDataTypeCustom("Bool", [] + { + auto type = DataTypeFactory::instance().get("UInt8"); + return std::make_pair(type, std::make_unique<DataTypeCustomDesc>( + std::make_unique<DataTypeCustomFixedName>("Bool"), std::make_unique<SerializationBool>(type->getDefaultSerialization()))); + }); + + factory.registerAlias("bool", "Bool", DataTypeFactory::CaseInsensitive); + factory.registerAlias("boolean", "Bool", DataTypeFactory::CaseInsensitive); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeEnum.cpp b/contrib/clickhouse/src/DataTypes/DataTypeEnum.cpp new file mode 100644 index 00000000000..1750ae785bf --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeEnum.cpp @@ -0,0 +1,348 @@ +#include <IO/WriteBufferFromString.h> +#include <DataTypes/DataTypeEnum.h> +#include <DataTypes/Serializations/SerializationEnum.h> +#include <DataTypes/DataTypeFactory.h> +#include <Parsers/IAST.h> +#include <Parsers/ASTFunction.h> +#include <Parsers/ASTLiteral.h> +#include <Common/typeid_cast.h> +#include <Common/assert_cast.h> +#include <Common/UTF8Helpers.h> +#include <Poco/UTF8Encoding.h> + +#include <limits> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_TYPE_OF_FIELD; + extern const int EMPTY_DATA_PASSED; + extern const int UNEXPECTED_AST_STRUCTURE; + extern const int ARGUMENT_OUT_OF_BOUND; +} + + +template <typename FieldType> struct 
EnumName; +template <> struct EnumName<Int8> { static constexpr auto value = "Enum8"; }; +template <> struct EnumName<Int16> { static constexpr auto value = "Enum16"; }; + + +template <typename Type> +const char * DataTypeEnum<Type>::getFamilyName() const +{ + return EnumName<FieldType>::value; +} + +template <typename Type> +std::string DataTypeEnum<Type>::generateMySQLName(const Values & values) +{ + WriteBufferFromOwnString out; + + writeString("ENUM", out); + writeChar('(', out); + + auto first = true; + for (const auto & name_and_value : values) + { + if (!first) + writeString(", ", out); + + first = false; + + writeQuotedString(name_and_value.first, out); + } + + writeChar(')', out); + + return out.str(); +} + +template <typename Type> +std::string DataTypeEnum<Type>::generateName(const Values & values) +{ + WriteBufferFromOwnString out; + + writeString(EnumName<FieldType>::value, out); + writeChar('(', out); + + auto first = true; + for (const auto & name_and_value : values) + { + if (!first) + writeString(", ", out); + + first = false; + + writeQuotedString(name_and_value.first, out); + writeString(" = ", out); + writeText(name_and_value.second, out); + } + + writeChar(')', out); + + return out.str(); +} + +template <typename Type> +DataTypeEnum<Type>::DataTypeEnum(const Values & values_) + : EnumValues<Type>(values_) + , type_name(generateName(this->getValues())) +{ +} + +template <typename Type> +Field DataTypeEnum<Type>::getDefault() const +{ + return this->getValues().front().second; +} + +template <typename Type> +void DataTypeEnum<Type>::insertDefaultInto(IColumn & column) const +{ + assert_cast<ColumnType &>(column).getData().push_back(this->getValues().front().second); +} + +template <typename Type> +bool DataTypeEnum<Type>::equals(const IDataType & rhs) const +{ + return typeid(rhs) == typeid(*this) && type_name == static_cast<const DataTypeEnum<Type> &>(rhs).type_name; +} + + +template <typename Type> +bool 
DataTypeEnum<Type>::textCanContainOnlyValidUTF8() const +{ + /// True only if every element name consists of complete, legal UTF-8 sequences. + for (const auto & elem : this->getValues()) + { + const char * pos = elem.first.data(); + const char * end = pos + elem.first.size(); + while (pos < end) + { + size_t length = UTF8::seqLength(*pos); + if (pos + length > end) + return false; + + if (Poco::UTF8Encoding::isLegal(reinterpret_cast<const unsigned char *>(pos), static_cast<int>(length))) + pos += length; + else + return false; + } + } + return true; +} + +/// Throws BAD_TYPE_OF_FIELD if `value` does not fit into the range of Type. +template <typename Type> +static void checkOverflow(Int64 value) +{ + if (!(std::numeric_limits<Type>::min() <= value && value <= std::numeric_limits<Type>::max())) + throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "DataTypeEnum: Unexpected value {}", toString(value)); +} + +/// Returns the element name for a field that holds either a name (String) or a numeric value (Int64). +/// Throws BAD_TYPE_OF_FIELD for any other field type. +/// NOTE(review): unlike castToValue, a UInt64 field is not accepted here; confirm this is intended. +template <typename Type> +Field DataTypeEnum<Type>::castToName(const Field & value_or_name) const +{ + if (value_or_name.getType() == Field::Types::String) + { + this->getValue(value_or_name.get<String>()); /// Check correctness + return value_or_name.get<String>(); + } + else if (value_or_name.getType() == Field::Types::Int64) + { + Int64 value = value_or_name.get<Int64>(); + checkOverflow<Type>(value); + return this->getNameForValue(static_cast<Type>(value)).toString(); + } + else + throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, + "DataTypeEnum: Unsupported type of field {}", value_or_name.getTypeName()); +} + +/// Returns the numeric value for a field that holds either a name (String) or a numeric value (Int64 or UInt64). +/// A UInt64 field is read through get<Int64>() and then range-checked against Type. +/// Throws BAD_TYPE_OF_FIELD for any other field type. +template <typename Type> +Field DataTypeEnum<Type>::castToValue(const Field & value_or_name) const +{ + if (value_or_name.getType() == Field::Types::String) + { + return this->getValue(value_or_name.get<String>()); + } + else if (value_or_name.getType() == Field::Types::Int64 + || value_or_name.getType() == Field::Types::UInt64) + { + Int64 value = value_or_name.get<Int64>(); + checkOverflow<Type>(value); + this->getNameForValue(static_cast<Type>(value)); /// Check correctness + return value; + } + else + throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, + "DataTypeEnum: Unsupported type of field {}", 
value_or_name.getTypeName()); +} + + +template <typename Type> +bool DataTypeEnum<Type>::contains(const IDataType & rhs) const +{ + if (const auto * rhs_enum8 = typeid_cast<const DataTypeEnum8 *>(&rhs)) + return this->containsAll(rhs_enum8->getValues()); + if (const auto * rhs_enum16 = typeid_cast<const DataTypeEnum16 *>(&rhs)) + return this->containsAll(rhs_enum16->getValues()); + return false; +} + +template <typename Type> +SerializationPtr DataTypeEnum<Type>::doGetDefaultSerialization() const +{ + return std::make_shared<SerializationEnum<Type>>(this->getValues()); +} + + +/// Explicit instantiations. +template class DataTypeEnum<Int8>; +template class DataTypeEnum<Int16>; + +static void checkASTStructure(const ASTPtr & child) +{ + const auto * func = child->as<ASTFunction>(); + if (!func + || func->name != "equals" + || func->parameters + || !func->arguments + || func->arguments->children.size() != 2) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Elements of Enum data type must be of form: " + "'name' = number, where name is string literal and number is an integer"); +} + +static void autoAssignNumberForEnum(const ASTPtr & arguments) +{ + Int64 literal_child_assign_num = 1; + ASTs assign_number_child; + assign_number_child.reserve(arguments->children.size()); + bool is_first_child = true; + size_t assign_count= 0; + + for (const ASTPtr & child : arguments->children) + { + if (child->as<ASTLiteral>()) + { + assign_count += !is_first_child; + ASTPtr func = makeASTFunction("equals", child, std::make_shared<ASTLiteral>(literal_child_assign_num + assign_count)); + assign_number_child.emplace_back(func); + } + else if (child->as<ASTFunction>()) + { + if (is_first_child) + { + checkASTStructure(child); + const auto * func = child->as<ASTFunction>(); + const auto * value_literal = func->arguments->children[1]->as<ASTLiteral>(); + + if (!value_literal + || (value_literal->value.getType() != Field::Types::UInt64 && value_literal->value.getType() != 
Field::Types::Int64)) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, + "Elements of Enum data type must be of form: " + "'name' = number or 'name', where name is string literal and number is an integer"); + + literal_child_assign_num = value_literal->value.get<Int64>(); + } + assign_number_child.emplace_back(child); + } + else + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, + "Elements of Enum data type must be of form: " + "'name' = number or 'name', where name is string literal and number is an integer"); + + is_first_child = false; + } + + if (assign_count != 0 && assign_count != arguments->children.size() - 1) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, + "All elements of Enum data type must be of form: " + "'name' = number or 'name', where name is string literal and number is an integer"); + + arguments->children = assign_number_child; +} + +template <typename DataTypeEnum> +static DataTypePtr createExact(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.empty()) + throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Enum data type cannot be empty"); + + typename DataTypeEnum::Values values; + values.reserve(arguments->children.size()); + + using FieldType = typename DataTypeEnum::FieldType; + + autoAssignNumberForEnum(arguments); + /// Children must be functions 'equals' with string literal as left argument and numeric literal as right argument. 
+ for (const ASTPtr & child : arguments->children) + { + checkASTStructure(child); + + const auto * func = child->as<ASTFunction>(); + const auto * name_literal = func->arguments->children[0]->as<ASTLiteral>(); + const auto * value_literal = func->arguments->children[1]->as<ASTLiteral>(); + + if (!name_literal + || !value_literal + || name_literal->value.getType() != Field::Types::String + || (value_literal->value.getType() != Field::Types::UInt64 && value_literal->value.getType() != Field::Types::Int64)) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, + "Elements of Enum data type must be of form: " + "'name' = number or 'name', where name is string literal and number is an integer"); + + const String & field_name = name_literal->value.get<String>(); + const auto value = value_literal->value.get<FieldType>(); + + if (value > std::numeric_limits<FieldType>::max() || value < std::numeric_limits<FieldType>::min()) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Value {} for element '{}' exceeds range of {}", + toString(value), field_name, EnumName<FieldType>::value); + + values.emplace_back(field_name, value); + } + + return std::make_shared<DataTypeEnum>(values); +} + +static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.empty()) + throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Enum data type cannot be empty"); + + autoAssignNumberForEnum(arguments); + /// Children must be functions 'equals' with string literal as left argument and numeric literal as right argument. 
+ for (const ASTPtr & child : arguments->children) + { + checkASTStructure(child); + + const auto * func = child->as<ASTFunction>(); + const auto * value_literal = func->arguments->children[1]->as<ASTLiteral>(); + + if (!value_literal + || (value_literal->value.getType() != Field::Types::UInt64 && value_literal->value.getType() != Field::Types::Int64)) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, + "Elements of Enum data type must be of form: " + "'name' = number or 'name', where name is string literal and number is an integer"); + + Int64 value = value_literal->value.get<Int64>(); + + if (value > std::numeric_limits<Int8>::max() || value < std::numeric_limits<Int8>::min()) + return createExact<DataTypeEnum16>(arguments); + } + + return createExact<DataTypeEnum8>(arguments); +} + +void registerDataTypeEnum(DataTypeFactory & factory) +{ + factory.registerDataType("Enum8", createExact<DataTypeEnum<Int8>>); + factory.registerDataType("Enum16", createExact<DataTypeEnum<Int16>>); + factory.registerDataType("Enum", create); + + /// MySQL + factory.registerAlias("ENUM", "Enum", DataTypeFactory::CaseInsensitive); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeEnum.h b/contrib/clickhouse/src/DataTypes/DataTypeEnum.h new file mode 100644 index 00000000000..d148f753c82 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeEnum.h @@ -0,0 +1,93 @@ +#pragma once + +#include <DataTypes/IDataType.h> +#include <DataTypes/EnumValues.h> +#include <Columns/ColumnVector.h> +#include <Columns/ColumnConst.h> +#include <Common/HashTable/HashMap.h> +#include <vector> +#include <unordered_map> + + +namespace DB +{ + +class IDataTypeEnum : public IDataType +{ +public: + virtual Field castToName(const Field & value_or_name) const = 0; + virtual Field castToValue(const Field & value_or_name) const = 0; + + bool isParametric() const override { return true; } + bool haveSubtypes() const override { return false; } + bool isValueRepresentedByNumber() const override 
{ return true; } + bool isValueRepresentedByInteger() const override { return true; } + bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override { return true; } + bool haveMaximumSizeOfValue() const override { return true; } + bool isCategorial() const override { return true; } + bool canBeInsideNullable() const override { return true; } + bool isComparable() const override { return true; } + + virtual bool contains(const IDataType & rhs) const = 0; +}; + + +template <typename Type> +class DataTypeEnum final : public IDataTypeEnum, public EnumValues<Type> +{ +public: + using FieldType = Type; + using ColumnType = ColumnVector<FieldType>; + static constexpr auto type_id = sizeof(FieldType) == 1 ? TypeIndex::Enum8 : TypeIndex::Enum16; + using typename EnumValues<Type>::Values; + + static constexpr bool is_parametric = true; + +private: + std::string type_name; + static std::string generateName(const Values & values); + static std::string generateMySQLName(const Values & values); + +public: + explicit DataTypeEnum(const Values & values_); + + std::string doGetName() const override { return type_name; } + const char * getFamilyName() const override; + String getSQLCompatibleName() const override { return generateMySQLName(this->getValues()); } + + TypeIndex getTypeId() const override { return type_id; } + + FieldType readValue(ReadBuffer & istr) const + { + FieldType x; + readText(x, istr); + return this->findByValue(x)->first; + } + + Field castToName(const Field & value_or_name) const override; + Field castToValue(const Field & value_or_name) const override; + + MutableColumnPtr createColumn() const override { return ColumnType::create(); } + + Field getDefault() const override; + void insertDefaultInto(IColumn & column) const override; + + bool equals(const IDataType & rhs) const override; + + bool textCanContainOnlyValidUTF8() const override; + size_t getSizeOfValueInMemory() const override { return sizeof(FieldType); } + + /// Check current 
Enum type extends another Enum type (contains all the same values and doesn't override name's with other values) + /// Example: + /// Enum('a' = 1, 'b' = 2) -> Enum('c' = 1, 'b' = 2, 'd' = 3) OK + /// Enum('a' = 1, 'b' = 2) -> Enum('a' = 2, 'b' = 1) NOT OK + bool contains(const IDataType & rhs) const override; + + SerializationPtr doGetDefaultSerialization() const override; +}; + + +using DataTypeEnum8 = DataTypeEnum<Int8>; +using DataTypeEnum16 = DataTypeEnum<Int16>; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeFactory.cpp b/contrib/clickhouse/src/DataTypes/DataTypeFactory.cpp new file mode 100644 index 00000000000..415f24d8151 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeFactory.cpp @@ -0,0 +1,301 @@ +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/DataTypeCustom.h> +#include <Parsers/parseQuery.h> +#include <Parsers/ParserCreateQuery.h> +#include <Parsers/ASTFunction.h> +#include <Parsers/ASTIdentifier.h> +#include <Parsers/ASTLiteral.h> +#include <Common/typeid_cast.h> +#include <Poco/String.h> +#include <Common/StringUtils/StringUtils.h> +#include <IO/WriteHelpers.h> +#include <Core/Defines.h> +#include <Common/CurrentThread.h> +#include <Interpreters/Context.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int UNKNOWN_TYPE; + extern const int ILLEGAL_SYNTAX_FOR_DATA_TYPE; + extern const int UNEXPECTED_AST_STRUCTURE; + extern const int DATA_TYPE_CANNOT_HAVE_ARGUMENTS; +} + +DataTypePtr DataTypeFactory::get(const String & full_name) const +{ + return getImpl<false>(full_name); +} + +DataTypePtr DataTypeFactory::tryGet(const String & full_name) const +{ + return getImpl<true>(full_name); +} + +template <bool nullptr_on_error> +DataTypePtr DataTypeFactory::getImpl(const String & full_name) const +{ + /// Data type parser can be invoked from coroutines with small stack. 
+ /// Value 315 is known to cause stack overflow in some test configurations (debug build, sanitizers) + /// let's make the threshold significantly lower. + /// It is impractical for user to have complex data types with this depth. + +#if defined(SANITIZER) || !defined(NDEBUG) + static constexpr size_t data_type_max_parse_depth = 150; +#else + static constexpr size_t data_type_max_parse_depth = 300; +#endif + + ParserDataType parser; + ASTPtr ast; + if constexpr (nullptr_on_error) + { + String out_err; + const char * start = full_name.data(); + ast = tryParseQuery(parser, start, start + full_name.size(), out_err, false, "data type", false, DBMS_DEFAULT_MAX_QUERY_SIZE, data_type_max_parse_depth); + if (!ast) + return nullptr; + } + else + { + ast = parseQuery(parser, full_name.data(), full_name.data() + full_name.size(), "data type", false, data_type_max_parse_depth); + } + + return getImpl<nullptr_on_error>(ast); +} + +DataTypePtr DataTypeFactory::get(const ASTPtr & ast) const +{ + return getImpl<false>(ast); +} + +DataTypePtr DataTypeFactory::tryGet(const ASTPtr & ast) const +{ + return getImpl<true>(ast); +} + +template <bool nullptr_on_error> +DataTypePtr DataTypeFactory::getImpl(const ASTPtr & ast) const +{ + if (const auto * func = ast->as<ASTFunction>()) + { + if (func->parameters) + { + if constexpr (nullptr_on_error) + return nullptr; + throw Exception(ErrorCodes::ILLEGAL_SYNTAX_FOR_DATA_TYPE, "Data type cannot have multiple parenthesized parameters."); + } + return getImpl<nullptr_on_error>(func->name, func->arguments); + } + + if (const auto * ident = ast->as<ASTIdentifier>()) + { + return getImpl<nullptr_on_error>(ident->name(), {}); + } + + if (const auto * lit = ast->as<ASTLiteral>()) + { + if (lit->value.isNull()) + return getImpl<nullptr_on_error>("Null", {}); + } + + if constexpr (nullptr_on_error) + return nullptr; + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected AST element for data type."); +} + +DataTypePtr 
DataTypeFactory::get(const String & family_name_param, const ASTPtr & parameters) const +{ + return getImpl<false>(family_name_param, parameters); +} + +DataTypePtr DataTypeFactory::tryGet(const String & family_name_param, const ASTPtr & parameters) const +{ + return getImpl<true>(family_name_param, parameters); +} + +template <bool nullptr_on_error> +DataTypePtr DataTypeFactory::getImpl(const String & family_name_param, const ASTPtr & parameters) const +{ + String family_name = getAliasToOrName(family_name_param); + + if (endsWith(family_name, "WithDictionary")) + { + ASTPtr low_cardinality_params = std::make_shared<ASTExpressionList>(); + String param_name = family_name.substr(0, family_name.size() - strlen("WithDictionary")); + if (parameters) + { + auto func = std::make_shared<ASTFunction>(); + func->name = param_name; + func->arguments = parameters; + low_cardinality_params->children.push_back(func); + } + else + low_cardinality_params->children.push_back(std::make_shared<ASTIdentifier>(param_name)); + + return getImpl<nullptr_on_error>("LowCardinality", low_cardinality_params); + } + + const auto * creator = findCreatorByName<nullptr_on_error>(family_name); + if constexpr (nullptr_on_error) + { + if (!creator) + return nullptr; + + try + { + return (*creator)(parameters); + } + catch (...) 
+ { + return nullptr; + } + } + else + { + assert(creator); + return (*creator)(parameters); + } +} + +DataTypePtr DataTypeFactory::getCustom(DataTypeCustomDescPtr customization) const +{ + if (!customization->name) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create custom type without name"); + + auto type = get(customization->name->getName()); + type->setCustomization(std::move(customization)); + return type; +} + + +void DataTypeFactory::registerDataType(const String & family_name, Value creator, CaseSensitiveness case_sensitiveness) +{ + if (creator == nullptr) + throw Exception(ErrorCodes::LOGICAL_ERROR, "DataTypeFactory: the data type family {} has been provided a null constructor", family_name); + + String family_name_lowercase = Poco::toLower(family_name); + + if (isAlias(family_name) || isAlias(family_name_lowercase)) + throw Exception(ErrorCodes::LOGICAL_ERROR, "DataTypeFactory: the data type family name '{}' is already registered as alias", family_name); + + if (!data_types.emplace(family_name, creator).second) + throw Exception(ErrorCodes::LOGICAL_ERROR, "DataTypeFactory: the data type family name '{}' is not unique", + family_name); + + if (case_sensitiveness == CaseInsensitive + && !case_insensitive_data_types.emplace(family_name_lowercase, creator).second) + throw Exception(ErrorCodes::LOGICAL_ERROR, "DataTypeFactory: the case insensitive data type family name '{}' is not unique", family_name); +} + +void DataTypeFactory::registerSimpleDataType(const String & name, SimpleCreator creator, CaseSensitiveness case_sensitiveness) +{ + if (creator == nullptr) + throw Exception(ErrorCodes::LOGICAL_ERROR, "DataTypeFactory: the data type {} has been provided a null constructor", + name); + + registerDataType(name, [name, creator](const ASTPtr & ast) + { + if (ast) + throw Exception(ErrorCodes::DATA_TYPE_CANNOT_HAVE_ARGUMENTS, "Data type {} cannot have arguments", name); + return creator(); + }, case_sensitiveness); +} + +void 
DataTypeFactory::registerDataTypeCustom(const String & family_name, CreatorWithCustom creator, CaseSensitiveness case_sensitiveness) +{ + registerDataType(family_name, [creator](const ASTPtr & ast) + { + auto res = creator(ast); + res.first->setCustomization(std::move(res.second)); + + return res.first; + }, case_sensitiveness); +} + +void DataTypeFactory::registerSimpleDataTypeCustom(const String & name, SimpleCreatorWithCustom creator, CaseSensitiveness case_sensitiveness) +{ + registerDataTypeCustom(name, [name, creator](const ASTPtr & ast) + { + if (ast) + throw Exception(ErrorCodes::DATA_TYPE_CANNOT_HAVE_ARGUMENTS, "Data type {} cannot have arguments", name); + return creator(); + }, case_sensitiveness); +} + +template <bool nullptr_on_error> +const DataTypeFactory::Value * DataTypeFactory::findCreatorByName(const String & family_name) const +{ + ContextPtr query_context; + if (CurrentThread::isInitialized()) + query_context = CurrentThread::get().getQueryContext(); + { + DataTypesDictionary::const_iterator it = data_types.find(family_name); + if (data_types.end() != it) + { + if (query_context && query_context->getSettingsRef().log_queries) + query_context->addQueryFactoriesInfo(Context::QueryLogFactories::DataType, family_name); + return &it->second; + } + } + + String family_name_lowercase = Poco::toLower(family_name); + + { + DataTypesDictionary::const_iterator it = case_insensitive_data_types.find(family_name_lowercase); + if (case_insensitive_data_types.end() != it) + { + if (query_context && query_context->getSettingsRef().log_queries) + query_context->addQueryFactoriesInfo(Context::QueryLogFactories::DataType, family_name_lowercase); + return &it->second; + } + } + + if constexpr (nullptr_on_error) + return nullptr; + + auto hints = this->getHints(family_name); + if (!hints.empty()) + throw Exception(ErrorCodes::UNKNOWN_TYPE, "Unknown data type family: {}. 
Maybe you meant: {}", family_name, toString(hints)); + else + throw Exception(ErrorCodes::UNKNOWN_TYPE, "Unknown data type family: {}", family_name); +} + +DataTypeFactory::DataTypeFactory() +{ + registerDataTypeNumbers(*this); + registerDataTypeDecimal(*this); + registerDataTypeDate(*this); + registerDataTypeDate32(*this); + registerDataTypeDateTime(*this); + registerDataTypeString(*this); + registerDataTypeFixedString(*this); + registerDataTypeEnum(*this); + registerDataTypeArray(*this); + registerDataTypeTuple(*this); + registerDataTypeNullable(*this); + registerDataTypeNothing(*this); + registerDataTypeUUID(*this); + registerDataTypeIPv4andIPv6(*this); + registerDataTypeAggregateFunction(*this); + registerDataTypeNested(*this); + registerDataTypeInterval(*this); + registerDataTypeLowCardinality(*this); + registerDataTypeDomainBool(*this); + registerDataTypeDomainSimpleAggregateFunction(*this); + registerDataTypeDomainGeo(*this); + registerDataTypeMap(*this); + registerDataTypeObject(*this); +} + +DataTypeFactory & DataTypeFactory::instance() +{ + static DataTypeFactory ret; + return ret; +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeFactory.h b/contrib/clickhouse/src/DataTypes/DataTypeFactory.h new file mode 100644 index 00000000000..ba7c1a3d7fe --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeFactory.h @@ -0,0 +1,104 @@ +#pragma once + +#include <DataTypes/IDataType.h> +#include <Parsers/IAST_fwd.h> +#include <Common/IFactoryWithAliases.h> +#include <DataTypes/DataTypeCustom.h> + + +#include <functional> +#include <memory> +#include <unordered_map> + + +namespace DB +{ + +class IDataType; +using DataTypePtr = std::shared_ptr<const IDataType>; + + +/** Creates a data type by name of data type family and parameters. 
+ */ +class DataTypeFactory final : private boost::noncopyable, public IFactoryWithAliases<std::function<DataTypePtr(const ASTPtr & parameters)>> +{ +private: + using SimpleCreator = std::function<DataTypePtr()>; + using DataTypesDictionary = std::unordered_map<String, Value>; + using CreatorWithCustom = std::function<std::pair<DataTypePtr, DataTypeCustomDescPtr>(const ASTPtr & parameters)>; + using SimpleCreatorWithCustom = std::function<std::pair<DataTypePtr,DataTypeCustomDescPtr>()>; + +public: + static DataTypeFactory & instance(); + + DataTypePtr get(const String & full_name) const; + DataTypePtr get(const String & family_name, const ASTPtr & parameters) const; + DataTypePtr get(const ASTPtr & ast) const; + DataTypePtr getCustom(DataTypeCustomDescPtr customization) const; + + /// Return nullptr in case of error. + DataTypePtr tryGet(const String & full_name) const; + DataTypePtr tryGet(const String & family_name, const ASTPtr & parameters) const; + DataTypePtr tryGet(const ASTPtr & ast) const; + + /// Register a type family by its name. + void registerDataType(const String & family_name, Value creator, CaseSensitiveness case_sensitiveness = CaseSensitive); + + /// Register a simple data type, that have no parameters. 
+ void registerSimpleDataType(const String & name, SimpleCreator creator, CaseSensitiveness case_sensitiveness = CaseSensitive); + + /// Register a customized type family + void registerDataTypeCustom(const String & family_name, CreatorWithCustom creator, CaseSensitiveness case_sensitiveness = CaseSensitive); + + /// Register a simple customized data type + void registerSimpleDataTypeCustom(const String & name, SimpleCreatorWithCustom creator, CaseSensitiveness case_sensitiveness = CaseSensitive); + +private: + template <bool nullptr_on_error> + DataTypePtr getImpl(const String & full_name) const; + template <bool nullptr_on_error> + DataTypePtr getImpl(const String & family_name, const ASTPtr & parameters) const; + template <bool nullptr_on_error> + DataTypePtr getImpl(const ASTPtr & ast) const; + template <bool nullptr_on_error> + const Value * findCreatorByName(const String & family_name) const; + + DataTypesDictionary data_types; + + /// Case insensitive data types will be additionally added here with lowercased name. 
+ DataTypesDictionary case_insensitive_data_types; + + DataTypeFactory(); + + const DataTypesDictionary & getMap() const override { return data_types; } + + const DataTypesDictionary & getCaseInsensitiveMap() const override { return case_insensitive_data_types; } + + String getFactoryName() const override { return "DataTypeFactory"; } +}; + +void registerDataTypeNumbers(DataTypeFactory & factory); +void registerDataTypeDecimal(DataTypeFactory & factory); +void registerDataTypeDate(DataTypeFactory & factory); +void registerDataTypeDate32(DataTypeFactory & factory); +void registerDataTypeDateTime(DataTypeFactory & factory); +void registerDataTypeString(DataTypeFactory & factory); +void registerDataTypeFixedString(DataTypeFactory & factory); +void registerDataTypeEnum(DataTypeFactory & factory); +void registerDataTypeArray(DataTypeFactory & factory); +void registerDataTypeTuple(DataTypeFactory & factory); +void registerDataTypeMap(DataTypeFactory & factory); +void registerDataTypeNullable(DataTypeFactory & factory); +void registerDataTypeNothing(DataTypeFactory & factory); +void registerDataTypeUUID(DataTypeFactory & factory); +void registerDataTypeIPv4andIPv6(DataTypeFactory & factory); +void registerDataTypeAggregateFunction(DataTypeFactory & factory); +void registerDataTypeNested(DataTypeFactory & factory); +void registerDataTypeInterval(DataTypeFactory & factory); +void registerDataTypeLowCardinality(DataTypeFactory & factory); +void registerDataTypeDomainBool(DataTypeFactory & factory); +void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory); +void registerDataTypeDomainGeo(DataTypeFactory & factory); +void registerDataTypeObject(DataTypeFactory & factory); + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeFixedString.cpp b/contrib/clickhouse/src/DataTypes/DataTypeFixedString.cpp new file mode 100644 index 00000000000..85af59e852d --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeFixedString.cpp @@ -0,0 +1,70 @@ 
+#include <Columns/ColumnFixedString.h> + +#include <DataTypes/DataTypeFixedString.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/Serializations/SerializationFixedString.h> + +#include <Parsers/IAST.h> +#include <Parsers/ASTLiteral.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int UNEXPECTED_AST_STRUCTURE; +} + + +/** Type name: the family name followed by the size n in parentheses. */ std::string DataTypeFixedString::doGetName() const +{ + return "FixedString(" + toString(n) + ")"; +} + +/** Creates a ColumnFixedString constructed with n. */ MutableColumnPtr DataTypeFixedString::createColumn() const +{ + return ColumnFixedString::create(n); +} + +/** The default value is an empty String. */ Field DataTypeFixedString::getDefault() const +{ + return String(); +} + +/** Equal when rhs has the same dynamic type and the same n. */ bool DataTypeFixedString::equals(const IDataType & rhs) const +{ + return typeid(rhs) == typeid(*this) && n == static_cast<const DataTypeFixedString &>(rhs).n; +} + +/** The default serialization is a SerializationFixedString built with n. */ SerializationPtr DataTypeFixedString::doGetDefaultSerialization() const +{ + return std::make_shared<SerializationFixedString>(n); +} + + +/** Creator passed to the factory: arguments must have exactly one child, an ASTLiteral whose value is a non-zero UInt64. Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH or UNEXPECTED_AST_STRUCTURE otherwise. */ static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "FixedString data type family must have exactly one argument - size in bytes"); + + const auto * argument = arguments->children[0]->as<ASTLiteral>(); + if (!argument || argument->value.getType() != Field::Types::UInt64 || argument->value.get<UInt64>() == 0) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, + "FixedString data type family must have a number (positive integer) as its argument"); + + return std::make_shared<DataTypeFixedString>(argument->value.get<UInt64>()); +} + + +/** Registers the FixedString family with create as its creator and adds the case-insensitive alias BINARY. */ void registerDataTypeFixedString(DataTypeFactory & factory) +{ + factory.registerDataType("FixedString", create); + + /// Compatibility alias. 
+ factory.registerAlias("BINARY", "FixedString", DataTypeFactory::CaseInsensitive); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeFixedString.h b/contrib/clickhouse/src/DataTypes/DataTypeFixedString.h new file mode 100644 index 00000000000..22ec793208d --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeFixedString.h @@ -0,0 +1,78 @@ +#pragma once + +#include <DataTypes/IDataType.h> +#include <Common/PODArray_fwd.h> +#include <Common/Exception.h> + +#define MAX_FIXEDSTRING_SIZE 0xFFFFFF +#define MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS 256 + + +namespace DB +{ + +class ColumnFixedString; + +namespace ErrorCodes +{ + extern const int ARGUMENT_OUT_OF_BOUND; +} + + +/** Data type parameterised by a size n, which getSizeOfValueInMemory() reports as the in-memory size of a value. */ class DataTypeFixedString final : public IDataType +{ +private: + size_t n; + +public: + using ColumnType = ColumnFixedString; + + static constexpr bool is_parametric = true; + static constexpr auto type_id = TypeIndex::FixedString; + + /** Throws ARGUMENT_OUT_OF_BOUND when n_ is 0 or greater than MAX_FIXEDSTRING_SIZE. */ explicit DataTypeFixedString(size_t n_) : n(n_) + { + if (n == 0) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "FixedString size must be positive"); + if (n > MAX_FIXEDSTRING_SIZE) + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "FixedString size is too large"); + } + + std::string doGetName() const override; + TypeIndex getTypeId() const override { return type_id; } + + const char * getFamilyName() const override { return "FixedString"; } + /// Use TEXT for compatibility with MySQL to allow arbitrary bytes. 
+ String getSQLCompatibleName() const override { return "TEXT"; } + + /** Size parameter this type was constructed with. */ size_t getN() const + { + return n; + } + + MutableColumnPtr createColumn() const override; + + Field getDefault() const override; + + bool equals(const IDataType & rhs) const override; + + SerializationPtr doGetDefaultSerialization() const override; + + bool isParametric() const override { return true; } + bool haveSubtypes() const override { return false; } + bool isComparable() const override { return true; } + bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override { return true; } + bool isValueUnambiguouslyRepresentedInFixedSizeContiguousMemoryRegion() const override { return true; } + bool haveMaximumSizeOfValue() const override { return true; } + size_t getSizeOfValueInMemory() const override { return n; } + bool isCategorial() const override { return true; } + bool canBeInsideNullable() const override { return true; } + bool canBeInsideLowCardinality() const override { return true; } + + /// Makes sure that the length of a newly inserted string to `chars` is equal to getN(). + /// If the length is less than getN() the function will add zero characters up to getN(). + /// If the length is greater than getN() the function will throw an exception. 
+ void alignStringLength(PaddedPODArray<UInt8> & chars, size_t old_size) const; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeFunction.cpp b/contrib/clickhouse/src/DataTypes/DataTypeFunction.cpp new file mode 100644 index 00000000000..82f3d7ee515 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeFunction.cpp @@ -0,0 +1,36 @@ +#include <DataTypes/DataTypeFunction.h> +#include <IO/WriteBufferFromString.h> +#include <IO/Operators.h> + + +namespace DB +{ + +/** Renders the name as Function(arguments -> return type). The argument list is parenthesized only when there is more than one argument, and a null argument or return type is printed as a question mark. */ std::string DataTypeFunction::doGetName() const +{ + WriteBufferFromOwnString res; + + res << "Function("; + /* open the inner parenthesis only for a multi-argument list */ if (argument_types.size() > 1) + res << "("; + for (size_t i = 0; i < argument_types.size(); ++i) + { + if (i > 0) + res << ", "; + const DataTypePtr & type = argument_types[i]; + res << (type ? type->getName() : "?"); + } + if (argument_types.size() > 1) + res << ")"; + res << " -> "; + res << (return_type ? return_type->getName() : "?"); + res << ")"; + return res.str(); +} + +/** Equal when rhs has the same dynamic type and an identical getName(). */ bool DataTypeFunction::equals(const IDataType & rhs) const +{ + return typeid(rhs) == typeid(*this) && getName() == rhs.getName(); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeFunction.h b/contrib/clickhouse/src/DataTypes/DataTypeFunction.h new file mode 100644 index 00000000000..9acec676ce0 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeFunction.h @@ -0,0 +1,45 @@ +#pragma once + +#include <DataTypes/IDataTypeDummy.h> + + +namespace DB +{ + +/** Special data type, representing lambda expression. + */ +class DataTypeFunction final : public IDataTypeDummy +{ +private: + DataTypes argument_types; + DataTypePtr return_type; + +public: + static constexpr bool is_parametric = true; + bool isParametric() const override { return true; } + + /// Some types could be still unknown. 
+ /** The arguments default to an empty DataTypes() and a null return type. */ explicit DataTypeFunction(const DataTypes & argument_types_ = DataTypes(), const DataTypePtr & return_type_ = nullptr) + : argument_types(argument_types_), return_type(return_type_) {} + + std::string doGetName() const override; + const char * getFamilyName() const override { return "Function"; } + String getSQLCompatibleName() const override { return "TEXT"; } + TypeIndex getTypeId() const override { return TypeIndex::Function; } + + const DataTypes & getArgumentTypes() const + { + return argument_types; + } + + const DataTypePtr & getReturnType() const + { + return return_type; + } + + bool equals(const IDataType & rhs) const override; + + bool supportsSparseSerialization() const override { return false; } +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeIPv4andIPv6.cpp b/contrib/clickhouse/src/DataTypes/DataTypeIPv4andIPv6.cpp new file mode 100644 index 00000000000..4c0b45f472a --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeIPv4andIPv6.cpp @@ -0,0 +1,17 @@ +#include <DataTypes/DataTypeIPv4andIPv6.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/Serializations/SerializationIPv4andIPv6.h> + + +namespace DB +{ + +/** Registers the simple data types IPv4 and IPv6 and the case-insensitive aliases INET4 and INET6 for them. */ void registerDataTypeIPv4andIPv6(DataTypeFactory & factory) +{ + factory.registerSimpleDataType("IPv4", [] { return DataTypePtr(std::make_shared<DataTypeIPv4>()); }); + factory.registerAlias("INET4", "IPv4", DataTypeFactory::CaseInsensitive); + factory.registerSimpleDataType("IPv6", [] { return DataTypePtr(std::make_shared<DataTypeIPv6>()); }); + factory.registerAlias("INET6", "IPv6", DataTypeFactory::CaseInsensitive); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeIPv4andIPv6.h b/contrib/clickhouse/src/DataTypes/DataTypeIPv4andIPv6.h new file mode 100644 index 00000000000..487ce04f67c --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeIPv4andIPv6.h @@ -0,0 +1,94 @@ +#pragma once + +#include <DataTypes/IDataType.h> +#include <Columns/ColumnVector.h> +#include <base/IPv4andIPv6.h> 
+#include <DataTypes/Serializations/SerializationIPv4andIPv6.h> + + +namespace DB +{ + +/** Non-parametric data type whose field type is IPv4 and whose column type is ColumnVector<IPv4>; the default value is IPv4{}. */ class DataTypeIPv4 : public IDataType +{ +public: + static constexpr bool is_parametric = false; + + using FieldType = IPv4; + using ColumnType = ColumnVector<IPv4>; + static constexpr auto type_id = TypeToTypeIndex<IPv4>; + + const char * getFamilyName() const override { return TypeName<IPv4>.data(); } + String getSQLCompatibleName() const override { return "TEXT"; } + + TypeIndex getTypeId() const override { return type_id; } + + Field getDefault() const override { return IPv4{}; } + + MutableColumnPtr createColumn() const override {return ColumnVector<IPv4>::create();} + + bool isParametric() const override { return false; } + bool haveSubtypes() const override { return false; } + + /** Equality is decided by dynamic type alone. */ bool equals(const IDataType & rhs) const override { return typeid(rhs) == typeid(*this); } + + bool canBeUsedInBitOperations() const override { return true; } + bool canBeInsideNullable() const override { return true; } + bool canBePromoted() const override { return false; } + bool shouldAlignRightInPrettyFormats() const override { return false; } + bool textCanContainOnlyValidUTF8() const override { return true; } + bool isComparable() const override { return true; } + /* the next three isValueRepresentedBy* overrides are present here but not in DataTypeIPv6 below */ bool isValueRepresentedByNumber() const override { return true; } + bool isValueRepresentedByInteger() const override { return true; } + bool isValueRepresentedByUnsignedInteger() const override { return true; } + bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override { return true; } + bool isValueUnambiguouslyRepresentedInFixedSizeContiguousMemoryRegion() const override { return true; } + bool haveMaximumSizeOfValue() const override { return true; } + size_t getSizeOfValueInMemory() const override { return sizeof(IPv4); } + bool isCategorial() const override { return true; } + bool canBeInsideLowCardinality() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override { 
return std::make_shared<SerializationIP<IPv4>>(); } +}; + +/** Non-parametric data type whose field type is IPv6 and whose column type is ColumnVector<IPv6>; the default value is IPv6{}. */ class DataTypeIPv6 : public IDataType +{ +public: + static constexpr bool is_parametric = false; + + using FieldType = IPv6; + using ColumnType = ColumnVector<IPv6>; + static constexpr auto type_id = TypeToTypeIndex<IPv6>; + + const char * getFamilyName() const override { return TypeName<IPv6>.data(); } + String getSQLCompatibleName() const override { return "TEXT"; } + + TypeIndex getTypeId() const override { return type_id; } + + Field getDefault() const override { return IPv6{}; } + + MutableColumnPtr createColumn() const override {return ColumnVector<IPv6>::create();} + + bool isParametric() const override { return false; } + bool haveSubtypes() const override { return false; } + + /** Equality is decided by dynamic type alone. */ bool equals(const IDataType & rhs) const override { return typeid(rhs) == typeid(*this); } + + bool canBeUsedInBitOperations() const override { return true; } + bool canBeInsideNullable() const override { return true; } + bool canBePromoted() const override { return false; } + bool shouldAlignRightInPrettyFormats() const override { return false; } + bool textCanContainOnlyValidUTF8() const override { return true; } + bool isComparable() const override { return true; } + bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override { return true; } + bool isValueUnambiguouslyRepresentedInFixedSizeContiguousMemoryRegion() const override { return true; } + bool haveMaximumSizeOfValue() const override { return true; } + size_t getSizeOfValueInMemory() const override { return sizeof(IPv6); } + bool isCategorial() const override { return true; } + bool canBeInsideLowCardinality() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override { return std::make_shared<SerializationIP<IPv6>>(); } +}; + + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeInterval.cpp b/contrib/clickhouse/src/DataTypes/DataTypeInterval.cpp new file mode 100644 index 00000000000..f8fe8bb3b4b --- 
/dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeInterval.cpp @@ -0,0 +1,31 @@ +#include <DataTypes/DataTypeInterval.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/Serializations/SerializationInterval.h> + + +namespace DB +{ + +/** The default serialization is a SerializationInterval built with kind. */ SerializationPtr DataTypeInterval::doGetDefaultSerialization() const { return std::make_shared<SerializationInterval>(kind); } + +/** Equal when rhs has the same dynamic type and the same kind. */ bool DataTypeInterval::equals(const IDataType & rhs) const +{ + return typeid(rhs) == typeid(*this) && kind == static_cast<const DataTypeInterval &>(rhs).kind; +} + +/** Registers one simple data type per interval kind, from Nanosecond through Year; each is named Interval followed by the kind name and creates a DataTypeInterval of that kind. */ void registerDataTypeInterval(DataTypeFactory & factory) +{ + factory.registerSimpleDataType("IntervalNanosecond", [] { return DataTypePtr(std::make_shared<DataTypeInterval>(IntervalKind::Nanosecond)); }); + factory.registerSimpleDataType("IntervalMicrosecond", [] { return DataTypePtr(std::make_shared<DataTypeInterval>(IntervalKind::Microsecond)); }); + factory.registerSimpleDataType("IntervalMillisecond", [] { return DataTypePtr(std::make_shared<DataTypeInterval>(IntervalKind::Millisecond)); }); + factory.registerSimpleDataType("IntervalSecond", [] { return DataTypePtr(std::make_shared<DataTypeInterval>(IntervalKind::Second)); }); + factory.registerSimpleDataType("IntervalMinute", [] { return DataTypePtr(std::make_shared<DataTypeInterval>(IntervalKind::Minute)); }); + factory.registerSimpleDataType("IntervalHour", [] { return DataTypePtr(std::make_shared<DataTypeInterval>(IntervalKind::Hour)); }); + factory.registerSimpleDataType("IntervalDay", [] { return DataTypePtr(std::make_shared<DataTypeInterval>(IntervalKind::Day)); }); + factory.registerSimpleDataType("IntervalWeek", [] { return DataTypePtr(std::make_shared<DataTypeInterval>(IntervalKind::Week)); }); + factory.registerSimpleDataType("IntervalMonth", [] { return DataTypePtr(std::make_shared<DataTypeInterval>(IntervalKind::Month)); }); + factory.registerSimpleDataType("IntervalQuarter", [] { return DataTypePtr(std::make_shared<DataTypeInterval>(IntervalKind::Quarter)); 
}); + factory.registerSimpleDataType("IntervalYear", [] { return DataTypePtr(std::make_shared<DataTypeInterval>(IntervalKind::Year)); }); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeInterval.h b/contrib/clickhouse/src/DataTypes/DataTypeInterval.h new file mode 100644 index 00000000000..c398a54268e --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeInterval.h @@ -0,0 +1,41 @@ +#pragma once + +#include <DataTypes/DataTypeNumberBase.h> +#include <Common/IntervalKind.h> + + +namespace DB +{ + +/** Data type to deal with INTERVAL in SQL (arithmetic on time intervals). + * + * Mostly the same as Int64. + * But also tagged with interval kind. + */ +class DataTypeInterval final : public DataTypeNumberBase<Int64> +{ +private: + IntervalKind kind; + +public: + static constexpr bool is_parametric = true; + + IntervalKind getKind() const { return kind; } + + explicit DataTypeInterval(IntervalKind kind_) : kind(kind_) {} + + SerializationPtr doGetDefaultSerialization() const override; + /** The full name is Interval followed by kind.toString(); the family name below is plain Interval for every kind. */ std::string doGetName() const override { return fmt::format("Interval{}", kind.toString()); } + const char * getFamilyName() const override { return "Interval"; } + String getSQLCompatibleName() const override { return "TEXT"; } + TypeIndex getTypeId() const override { return TypeIndex::Interval; } + + bool equals(const IDataType & rhs) const override; + + bool isParametric() const override { return true; } + bool isCategorial() const override { return false; } + bool canBeInsideNullable() const override { return true; } +}; + +} + diff --git a/contrib/clickhouse/src/DataTypes/DataTypeLowCardinality.cpp b/contrib/clickhouse/src/DataTypes/DataTypeLowCardinality.cpp new file mode 100644 index 00000000000..8293455cabc --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeLowCardinality.cpp @@ -0,0 +1,179 @@ +#include <Columns/ColumnFixedString.h> +#include <Columns/ColumnLowCardinality.h> +#include <Columns/ColumnUnique.h> +#include <Columns/ColumnsCommon.h> 
+#include <Common/HashTable/HashMap.h> +#include <Common/assert_cast.h> +#include <Common/typeid_cast.h> +#include <Core/Field.h> +#include <base/TypeLists.h> +#include <DataTypes/DataTypeDate.h> +#include <DataTypes/DataTypeDateTime.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/DataTypeInterval.h> +#include <DataTypes/DataTypeLowCardinality.h> +#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/DataTypesNumber.h> +#include <DataTypes/Serializations/SerializationLowCardinality.h> +#include <Parsers/IAST.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int LOGICAL_ERROR; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +DataTypeLowCardinality::DataTypeLowCardinality(DataTypePtr dictionary_type_) + : dictionary_type(std::move(dictionary_type_)) +{ + auto inner_type = dictionary_type; + if (dictionary_type->isNullable()) + inner_type = static_cast<const DataTypeNullable &>(*dictionary_type).getNestedType(); + + if (!inner_type->canBeInsideLowCardinality()) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "DataTypeLowCardinality is supported only for numbers, strings, Date or DateTime, but got {}", + dictionary_type->getName()); +} + +namespace +{ + template <typename Creator> + struct CreateColumnVector + { + MutableColumnUniquePtr & column; + const IDataType & keys_type; + const Creator & creator; + + CreateColumnVector(MutableColumnUniquePtr & column_, const IDataType & keys_type_, const Creator & creator_) + : column(column_), keys_type(keys_type_), creator(creator_) + { + } + + template <typename T> + void operator()(TypeList<T>) + { + if (typeid_cast<const DataTypeNumber<T> *>(&keys_type)) + column = creator(static_cast<ColumnVector<T> *>(nullptr)); + } + }; +} + +template <typename Creator> +MutableColumnUniquePtr DataTypeLowCardinality::createColumnUniqueImpl(const IDataType & keys_type, + const Creator & creator) +{ + const auto * type = &keys_type; + if 
(const auto * nullable_type = typeid_cast<const DataTypeNullable *>(&keys_type)) + type = nullable_type->getNestedType().get(); + + WhichDataType which(type); + + if (which.isString()) + return creator(static_cast<ColumnString *>(nullptr)); + else if (which.isFixedString()) + return creator(static_cast<ColumnFixedString *>(nullptr)); + else if (which.isDate()) + return creator(static_cast<ColumnVector<UInt16> *>(nullptr)); + else if (which.isDate32()) + return creator(static_cast<ColumnVector<Int32> *>(nullptr)); + else if (which.isDateTime()) + return creator(static_cast<ColumnVector<UInt32> *>(nullptr)); + else if (which.isUUID()) + return creator(static_cast<ColumnVector<UUID> *>(nullptr)); + else if (which.isIPv4()) + return creator(static_cast<ColumnVector<IPv4> *>(nullptr)); + else if (which.isIPv6()) + return creator(static_cast<ColumnVector<IPv6> *>(nullptr)); + else if (which.isInterval()) + return creator(static_cast<DataTypeInterval::ColumnType *>(nullptr)); + else if (which.isInt() || which.isUInt() || which.isFloat()) + { + MutableColumnUniquePtr column; + TypeListUtils::forEach(TypeListIntAndFloat{}, CreateColumnVector(column, *type, creator)); + + if (!column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected numeric type: {}", type->getName()); + + return column; + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected dictionary type for DataTypeLowCardinality: {}", + type->getName()); +} + + +MutableColumnUniquePtr DataTypeLowCardinality::createColumnUnique(const IDataType & keys_type) +{ + auto creator = [&](auto x) + { + using ColumnType = typename std::remove_pointer<decltype(x)>::type; + return ColumnUnique<ColumnType>::create(keys_type); + }; + return createColumnUniqueImpl(keys_type, creator); +} + +MutableColumnUniquePtr DataTypeLowCardinality::createColumnUnique(const IDataType & keys_type, MutableColumnPtr && keys) +{ + auto creator = [&](auto x) + { + using ColumnType = typename std::remove_pointer<decltype(x)>::type; + 
return ColumnUnique<ColumnType>::create(std::move(keys), keys_type.isNullable()); + }; + return createColumnUniqueImpl(keys_type, creator); +} +/** Creates a ColumnLowCardinality from a fresh UInt8 index column and a ColumnUnique dictionary built for dictionary_type. */ +MutableColumnPtr DataTypeLowCardinality::createColumn() const +{ + MutableColumnPtr indexes = DataTypeUInt8().createColumn(); + MutableColumnPtr dictionary = createColumnUnique(*dictionary_type); + return ColumnLowCardinality::create(std::move(dictionary), std::move(indexes)); +} +/** The default value is the default of the dictionary type. */ +Field DataTypeLowCardinality::getDefault() const +{ + return dictionary_type->getDefault(); +} +/** Equal only when rhs has exactly the same dynamic type (typeid) and its dictionary type equals this one's. */ +bool DataTypeLowCardinality::equals(const IDataType & rhs) const +{ + if (typeid(rhs) != typeid(*this)) + return false; + + const auto & low_cardinality_rhs= static_cast<const DataTypeLowCardinality &>(rhs); + return dictionary_type->equals(*low_cardinality_rhs.dictionary_type); +} + +SerializationPtr DataTypeLowCardinality::doGetDefaultSerialization() const +{ + return std::make_shared<SerializationLowCardinality>(dictionary_type); +} + +/** Factory callback for "LowCardinality": requires exactly one AST argument (the element type), otherwise throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH. */ +static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "LowCardinality data type family must have single argument - type of elements"); + + return std::make_shared<DataTypeLowCardinality>(DataTypeFactory::instance().get(arguments->children[0])); +} + +void registerDataTypeLowCardinality(DataTypeFactory & factory) +{ + factory.registerDataType("LowCardinality", create); +} + +/** Returns the dictionary type when type is a DataTypeLowCardinality; any other type is returned unchanged. Only the top level is inspected. */ +DataTypePtr removeLowCardinality(const DataTypePtr & type) +{ + if (const auto * low_cardinality_type = typeid_cast<const DataTypeLowCardinality *>(type.get())) + return low_cardinality_type->getDictionaryType(); + return type; +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeLowCardinality.h b/contrib/clickhouse/src/DataTypes/DataTypeLowCardinality.h new file mode 100644 index 00000000000..d2a414cb073 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeLowCardinality.h @@ -0,0 +1,95 @@ +#pragma once 
+ +#include <DataTypes/IDataType.h> +#include <Columns/IColumnUnique.h> + + +namespace DB +{ + +class DataTypeLowCardinality : public IDataType +{ +private: + DataTypePtr dictionary_type; + + +public: + explicit DataTypeLowCardinality(DataTypePtr dictionary_type_); + + const DataTypePtr & getDictionaryType() const { return dictionary_type; } + + String doGetName() const override + { + return "LowCardinality(" + dictionary_type->getName() + ")"; + } + const char * getFamilyName() const override { return "LowCardinality"; } + String getSQLCompatibleName() const override { return dictionary_type->getSQLCompatibleName(); } + + TypeIndex getTypeId() const override { return TypeIndex::LowCardinality; } + + MutableColumnPtr createColumn() const override; + + Field getDefault() const override; + + bool equals(const IDataType & rhs) const override; + + bool isParametric() const override { return true; } + bool haveSubtypes() const override { return true; } + bool cannotBeStoredInTables() const override { return dictionary_type->cannotBeStoredInTables(); } + bool shouldAlignRightInPrettyFormats() const override { return dictionary_type->shouldAlignRightInPrettyFormats(); } + bool textCanContainOnlyValidUTF8() const override { return dictionary_type->textCanContainOnlyValidUTF8(); } + bool isComparable() const override { return dictionary_type->isComparable(); } + bool canBeComparedWithCollation() const override { return dictionary_type->canBeComparedWithCollation(); } + bool canBeUsedAsVersion() const override { return dictionary_type->canBeUsedAsVersion(); } + bool isSummable() const override { return dictionary_type->isSummable(); } + bool canBeUsedInBitOperations() const override { return dictionary_type->canBeUsedInBitOperations(); } + bool canBeUsedInBooleanContext() const override { return dictionary_type->canBeUsedInBooleanContext(); } + bool isValueRepresentedByNumber() const override { return dictionary_type->isValueRepresentedByNumber(); } + bool 
isValueRepresentedByInteger() const override { return dictionary_type->isValueRepresentedByInteger(); } + bool isValueRepresentedByUnsignedInteger() const override { return dictionary_type->isValueRepresentedByUnsignedInteger(); } + bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override { return true; } + bool haveMaximumSizeOfValue() const override { return dictionary_type->haveMaximumSizeOfValue(); } + size_t getMaximumSizeOfValueInMemory() const override { return dictionary_type->getMaximumSizeOfValueInMemory(); } + size_t getSizeOfValueInMemory() const override { return dictionary_type->getSizeOfValueInMemory(); } + bool isCategorial() const override { return false; } + bool isNullable() const override { return false; } + bool onlyNull() const override { return false; } + bool lowCardinality() const override { return true; } + bool supportsSparseSerialization() const override { return false; } + bool isLowCardinalityNullable() const override { return dictionary_type->isNullable(); } + + static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type); + static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type, MutableColumnPtr && keys); + +private: + SerializationPtr doGetDefaultSerialization() const override; + + template <typename ... Params> + using SerializeFunctionPtr = void (IDataType::*)(const IColumn &, size_t, Params ...) const; + + template <typename... Params, typename... Args> + void serializeImpl(const IColumn & column, size_t row_num, SerializeFunctionPtr<Params...> func, Args &&... args) const; + + template <typename ... Params> + using DeserializeFunctionPtr = void (IDataType::*)(IColumn &, Params ...) const; + + template <typename ... Params, typename... Args> + void deserializeImpl(IColumn & column, DeserializeFunctionPtr<Params...> func, Args &&... 
args) const; + + template <typename Creator> + static MutableColumnUniquePtr createColumnUniqueImpl(const IDataType & keys_type, const Creator & creator); +}; + +/// Returns dictionary type if type is DataTypeLowCardinality, type otherwise. +DataTypePtr removeLowCardinality(const DataTypePtr & type); + +/// Remove LowCardinality recursively from all nested types. +DataTypePtr recursiveRemoveLowCardinality(const DataTypePtr & type); + +/// Remove LowCardinality recursively from all nested columns. +ColumnPtr recursiveRemoveLowCardinality(const ColumnPtr & column); + +/// Convert column of type from_type to type to_type by converting nested LowCardinality columns. +ColumnPtr recursiveLowCardinalityTypeConversion(const ColumnPtr & column, const DataTypePtr & from_type, const DataTypePtr & to_type); + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeLowCardinalityHelpers.cpp b/contrib/clickhouse/src/DataTypes/DataTypeLowCardinalityHelpers.cpp new file mode 100644 index 00000000000..98eb76267a4 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeLowCardinalityHelpers.cpp @@ -0,0 +1,208 @@ +#include <Columns/ColumnArray.h> +#include <Columns/ColumnConst.h> +#include <Columns/ColumnTuple.h> +#include <Columns/ColumnMap.h> +#include <Columns/ColumnLowCardinality.h> +#include <Columns/ColumnFunction.h> + +#include <DataTypes/DataTypeLowCardinality.h> +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/DataTypeMap.h> + +#include <Common/assert_cast.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int TYPE_MISMATCH; +} + +DataTypePtr recursiveRemoveLowCardinality(const DataTypePtr & type) +{ + if (!type) + return type; + + if (const auto * array_type = typeid_cast<const DataTypeArray *>(type.get())) + return std::make_shared<DataTypeArray>(recursiveRemoveLowCardinality(array_type->getNestedType())); + + if (const auto * tuple_type = typeid_cast<const DataTypeTuple 
*>(type.get())) + { + DataTypes elements = tuple_type->getElements(); + for (auto & element : elements) + element = recursiveRemoveLowCardinality(element); + + if (tuple_type->haveExplicitNames()) + return std::make_shared<DataTypeTuple>(elements, tuple_type->getElementNames()); + else + return std::make_shared<DataTypeTuple>(elements); + } + + if (const auto * map_type = typeid_cast<const DataTypeMap *>(type.get())) + { + return std::make_shared<DataTypeMap>(recursiveRemoveLowCardinality(map_type->getKeyType()), recursiveRemoveLowCardinality(map_type->getValueType())); + } + + if (const auto * low_cardinality_type = typeid_cast<const DataTypeLowCardinality *>(type.get())) + return low_cardinality_type->getDictionaryType(); + + return type; +} + +ColumnPtr recursiveRemoveLowCardinality(const ColumnPtr & column) +{ + if (!column) + return column; + + if (const auto * column_array = typeid_cast<const ColumnArray *>(column.get())) + { + const auto & data = column_array->getDataPtr(); + auto data_no_lc = recursiveRemoveLowCardinality(data); + if (data.get() == data_no_lc.get()) + return column; + + return ColumnArray::create(data_no_lc, column_array->getOffsetsPtr()); + } + + if (const auto * column_const = typeid_cast<const ColumnConst *>(column.get())) + { + const auto & nested = column_const->getDataColumnPtr(); + auto nested_no_lc = recursiveRemoveLowCardinality(nested); + if (nested.get() == nested_no_lc.get()) + return column; + + return ColumnConst::create(nested_no_lc, column_const->size()); + } + + if (const auto * column_tuple = typeid_cast<const ColumnTuple *>(column.get())) + { + auto columns = column_tuple->getColumns(); + for (auto & element : columns) + element = recursiveRemoveLowCardinality(element); + return ColumnTuple::create(columns); + } + + if (const auto * column_map = typeid_cast<const ColumnMap *>(column.get())) + { + const auto & nested = column_map->getNestedColumnPtr(); + auto nested_no_lc = recursiveRemoveLowCardinality(nested); + if 
(nested.get() == nested_no_lc.get()) + return column; + + return ColumnMap::create(nested_no_lc); + } + + /// Special case when the column is a lazy argument of a short-circuit function. + /// We should call recursiveRemoveLowCardinality on the result column + /// when the function is executed. + if (const auto * column_function = typeid_cast<const ColumnFunction *>(column.get())) + { + if (!column_function->isShortCircuitArgument()) + return column; + + return column_function->recursivelyConvertResultToFullColumnIfLowCardinality(); + } + + if (const auto * column_low_cardinality = typeid_cast<const ColumnLowCardinality *>(column.get())) + return column_low_cardinality->convertToFullColumn(); + + return column; +} + +ColumnPtr recursiveLowCardinalityTypeConversion(const ColumnPtr & column, const DataTypePtr & from_type, const DataTypePtr & to_type) +{ + if (!column) + return column; + + if (from_type->equals(*to_type)) + return column; + + /// We can allow inserting an enum column if its numeric type is the same as the column's type in the table. 
+ if (WhichDataType(to_type).isEnum() && from_type->getTypeId() == to_type->getTypeId()) + return column; + + if (const auto * column_const = typeid_cast<const ColumnConst *>(column.get())) + { + const auto & nested = column_const->getDataColumnPtr(); + auto nested_no_lc = recursiveLowCardinalityTypeConversion(nested, from_type, to_type); + if (nested.get() == nested_no_lc.get()) + return column; + + return ColumnConst::create(nested_no_lc, column_const->size()); + } + + if (const auto * low_cardinality_type = typeid_cast<const DataTypeLowCardinality *>(from_type.get())) + { + if (to_type->equals(*low_cardinality_type->getDictionaryType())) + return column->convertToFullColumnIfLowCardinality(); + } + + if (const auto * low_cardinality_type = typeid_cast<const DataTypeLowCardinality *>(to_type.get())) + { + if (from_type->equals(*low_cardinality_type->getDictionaryType())) + { + auto col = low_cardinality_type->createColumn(); + assert_cast<ColumnLowCardinality &>(*col).insertRangeFromFullColumn(*column, 0, column->size()); + return col; + } + } + + if (const auto * from_array_type = typeid_cast<const DataTypeArray *>(from_type.get())) + { + if (const auto * to_array_type = typeid_cast<const DataTypeArray *>(to_type.get())) + { + const auto * column_array = typeid_cast<const ColumnArray *>(column.get()); + if (!column_array) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Unexpected column {} for type {}", + column->getName(), from_type->getName()); + + const auto & nested_from = from_array_type->getNestedType(); + const auto & nested_to = to_array_type->getNestedType(); + + return ColumnArray::create( + recursiveLowCardinalityTypeConversion(column_array->getDataPtr(), nested_from, nested_to), + column_array->getOffsetsPtr()); + } + } + + if (const auto * from_tuple_type = typeid_cast<const DataTypeTuple *>(from_type.get())) + { + if (const auto * to_tuple_type = typeid_cast<const DataTypeTuple *>(to_type.get())) + { + const auto * column_tuple = typeid_cast<const 
ColumnTuple *>(column.get()); + if (!column_tuple) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Unexpected column {} for type {}", + column->getName(), from_type->getName()); + + auto columns = column_tuple->getColumns(); + const auto & from_elements = from_tuple_type->getElements(); + const auto & to_elements = to_tuple_type->getElements(); + + bool has_converted = false; + + for (size_t i = 0; i < columns.size(); ++i) + { + auto & element = columns[i]; + auto element_no_lc = recursiveLowCardinalityTypeConversion(element, from_elements.at(i), to_elements.at(i)); + if (element.get() != element_no_lc.get()) + { + element = element_no_lc; + has_converted = true; + } + } + + if (!has_converted) + return column; + + return ColumnTuple::create(columns); + } + } + + throw Exception(ErrorCodes::TYPE_MISMATCH, "Cannot convert: {} to {}", from_type->getName(), to_type->getName()); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeMap.cpp b/contrib/clickhouse/src/DataTypes/DataTypeMap.cpp new file mode 100644 index 00000000000..90561857fad --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeMap.cpp @@ -0,0 +1,158 @@ +#include <base/map.h> +#include <Common/StringUtils/StringUtils.h> +#include <Columns/ColumnMap.h> +#include <Core/Field.h> +#include <DataTypes/DataTypeMap.h> +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/DataTypeLowCardinality.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/Serializations/SerializationMap.h> +#include <Parsers/IAST.h> +#include <IO/WriteBufferFromString.h> +#include <IO/Operators.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int BAD_ARGUMENTS; +} + +DataTypeMap::DataTypeMap(const DataTypePtr & nested_) + : nested(nested_) +{ + const auto * type_array = typeid_cast<const DataTypeArray *>(nested.get()); + if (!type_array) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Expected 
Array(Tuple(key, value)) type, got {}", nested->getName()); + + const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type_array->getNestedType().get()); + if (!type_tuple) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Expected Array(Tuple(key, value)) type, got {}", nested->getName()); + + if (type_tuple->getElements().size() != 2) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Expected Array(Tuple(key, value)) type, got {}", nested->getName()); + + key_type = type_tuple->getElement(0); + value_type = type_tuple->getElement(1); + assertKeyType(); +} + +DataTypeMap::DataTypeMap(const DataTypes & elems_) +{ + assert(elems_.size() == 2); + key_type = elems_[0]; + value_type = elems_[1]; + + assertKeyType(); + + nested = std::make_shared<DataTypeArray>( + std::make_shared<DataTypeTuple>(DataTypes{key_type, value_type}, Names{"keys", "values"})); +} + +DataTypeMap::DataTypeMap(const DataTypePtr & key_type_, const DataTypePtr & value_type_) + : key_type(key_type_), value_type(value_type_) + , nested(std::make_shared<DataTypeArray>( + std::make_shared<DataTypeTuple>(DataTypes{key_type_, value_type_}, Names{"keys", "values"}))) +{ + assertKeyType(); +} + +void DataTypeMap::assertKeyType() const +{ + if (!checkKeyType(key_type)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Type of Map key must be a type, that can be represented by integer " + "or String or FixedString (possibly LowCardinality) or UUID or IPv6," + " but {} given", key_type->getName()); +} + + +std::string DataTypeMap::doGetName() const +{ + WriteBufferFromOwnString s; + s << "Map(" << key_type->getName() << ", " << value_type->getName() << ")"; + + return s.str(); +} + +MutableColumnPtr DataTypeMap::createColumn() const +{ + return ColumnMap::create(nested->createColumn()); +} + +Field DataTypeMap::getDefault() const +{ + return Map(); +} + +SerializationPtr DataTypeMap::doGetDefaultSerialization() const +{ + return std::make_shared<SerializationMap>( + key_type->getDefaultSerialization(), + 
value_type->getDefaultSerialization(), + nested->getDefaultSerialization()); +} +/** Equal only when rhs has exactly the same dynamic type (typeid) and the nested types are equal. */ +bool DataTypeMap::equals(const IDataType & rhs) const +{ + if (typeid(rhs) != typeid(*this)) + return false; + + const DataTypeMap & rhs_map = static_cast<const DataTypeMap &>(rhs); + return nested->equals(*rhs_map.nested); +} +/** Returns true for an allowed Map key type. A LowCardinality key is allowed only when its dictionary type is String or FixedString; any other key must be integer-represented, String/FixedString, Nothing, IPv6 or UUID. */ +bool DataTypeMap::checkKeyType(DataTypePtr key_type) +{ + if (key_type->getTypeId() == TypeIndex::LowCardinality) + { + const auto & low_cardinality_data_type = assert_cast<const DataTypeLowCardinality &>(*key_type); + if (!isStringOrFixedString(*(low_cardinality_data_type.getDictionaryType()))) + return false; + } + else if (!key_type->isValueRepresentedByInteger() + && !isStringOrFixedString(*key_type) + && !WhichDataType(key_type).isNothing() + && !WhichDataType(key_type).isIPv6() + && !WhichDataType(key_type).isUUID()) + { + return false; + } + + return true; +} +/** Rebuilds the nested Array(Tuple(...)) from the tuple's element types only, without passing element names. */ +DataTypePtr DataTypeMap::getNestedTypeWithUnnamedTuple() const +{ + const auto & from_array = assert_cast<const DataTypeArray &>(*nested); + const auto & from_tuple = assert_cast<const DataTypeTuple &>(*from_array.getNestedType()); + return std::make_shared<DataTypeArray>(std::make_shared<DataTypeTuple>(from_tuple.getElements())); +} +/** Factory callback for "Map": requires exactly two AST arguments (key type, value type), otherwise throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH. */ +static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.size() != 2) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Map data type family must have two arguments: key and value types"); + + DataTypes nested_types; + nested_types.reserve(arguments->children.size()); + + for (const ASTPtr & child : arguments->children) + nested_types.emplace_back(DataTypeFactory::instance().get(child)); + + return std::make_shared<DataTypeMap>(nested_types); +} + + +void registerDataTypeMap(DataTypeFactory & factory) +{ + factory.registerDataType("Map", create); +} +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeMap.h b/contrib/clickhouse/src/DataTypes/DataTypeMap.h new file mode 100644 index 
00000000000..294c5d7ac77 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeMap.h @@ -0,0 +1,62 @@ +#pragma once + +#include <DataTypes/IDataType.h> + + +namespace DB +{ + +/** Map data type. + * Map is implemented as two arrays of keys and values. + * Serialization of type 'Map(K, V)' is similar to serialization + * of 'Array(Tuple(keys K, values V))' or, in other words, of 'Nested(keys K, values V)'. + */ +class DataTypeMap final : public IDataType +{ +private: + DataTypePtr key_type; + DataTypePtr value_type; + + /// 'nested' is an Array(Tuple(key_type, value_type)) + DataTypePtr nested; + +public: + static constexpr bool is_parametric = true; + + explicit DataTypeMap(const DataTypePtr & nested_); + explicit DataTypeMap(const DataTypes & elems); + DataTypeMap(const DataTypePtr & key_type_, const DataTypePtr & value_type_); + + TypeIndex getTypeId() const override { return TypeIndex::Map; } + std::string doGetName() const override; + const char * getFamilyName() const override { return "Map"; } + String getSQLCompatibleName() const override { return "JSON"; } + + bool canBeInsideNullable() const override { return false; } + + MutableColumnPtr createColumn() const override; + + Field getDefault() const override; + + bool equals(const IDataType & rhs) const override; + bool isComparable() const override { return key_type->isComparable() && value_type->isComparable(); } + bool isParametric() const override { return true; } + bool haveSubtypes() const override { return true; } + bool hasDynamicSubcolumns() const override { return nested->hasDynamicSubcolumns(); } + + const DataTypePtr & getKeyType() const { return key_type; } + const DataTypePtr & getValueType() const { return value_type; } + DataTypes getKeyValueTypes() const { return {key_type, value_type}; } + const DataTypePtr & getNestedType() const { return nested; } + DataTypePtr getNestedTypeWithUnnamedTuple() const; + + SerializationPtr doGetDefaultSerialization() const override; + + static bool 
checkKeyType(DataTypePtr key_type); + +private: + void assertKeyType() const; +}; + +} + diff --git a/contrib/clickhouse/src/DataTypes/DataTypeNested.cpp b/contrib/clickhouse/src/DataTypes/DataTypeNested.cpp new file mode 100644 index 00000000000..a4b7442393c --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeNested.cpp @@ -0,0 +1,75 @@ +#include <DataTypes/DataTypeNested.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeTuple.h> +#include <IO/Operators.h> +#include <Common/quoteString.h> +#include <Parsers/ASTNameTypePair.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int EMPTY_DATA_PASSED; + extern const int BAD_ARGUMENTS; +} + +String DataTypeNestedCustomName::getName() const +{ + WriteBufferFromOwnString s; + s << "Nested("; + for (size_t i = 0; i < elems.size(); ++i) + { + if (i != 0) + s << ", "; + + s << backQuoteIfNeed(names[i]) << ' '; + s << elems[i]->getName(); + } + s << ")"; + + return s.str(); +} + +static std::pair<DataTypePtr, DataTypeCustomDescPtr> create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.empty()) + throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Nested cannot be empty"); + + DataTypes nested_types; + Strings nested_names; + nested_types.reserve(arguments->children.size()); + nested_names.reserve(arguments->children.size()); + + for (const auto & child : arguments->children) + { + const auto * name_type = child->as<ASTNameTypePair>(); + if (!name_type) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Data type Nested accepts only pairs with name and type"); + + auto nested_type = DataTypeFactory::instance().get(name_type->type); + nested_types.push_back(std::move(nested_type)); + nested_names.push_back(name_type->name); + } + + auto data_type = std::make_shared<DataTypeArray>(std::make_shared<DataTypeTuple>(nested_types, nested_names)); + auto custom_name = std::make_unique<DataTypeNestedCustomName>(nested_types, nested_names); + + 
return std::make_pair(std::move(data_type), std::make_unique<DataTypeCustomDesc>(std::move(custom_name))); +} + +void registerDataTypeNested(DataTypeFactory & factory) +{ + return factory.registerDataTypeCustom("Nested", create); +} + +DataTypePtr createNested(const DataTypes & types, const Names & names) +{ + auto custom_desc = std::make_unique<DataTypeCustomDesc>( + std::make_unique<DataTypeNestedCustomName>(types, names)); + + return DataTypeFactory::instance().getCustom(std::move(custom_desc)); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeNested.h b/contrib/clickhouse/src/DataTypes/DataTypeNested.h new file mode 100644 index 00000000000..1ad06477a6e --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeNested.h @@ -0,0 +1,33 @@ +#pragma once + +#include <DataTypes/IDataType.h> + + +namespace DB +{ + +class DataTypeNestedCustomName final : public IDataTypeCustomName +{ +private: + DataTypes elems; + Strings names; + +public: + DataTypeNestedCustomName(const DataTypes & elems_, const Strings & names_) + : elems(elems_), names(names_) + { + } + + String getName() const override; +}; + +DataTypePtr createNested(const DataTypes & types, const Names & names); + +template <typename DataType> +inline bool isNested(const DataType & data_type) +{ + return typeid_cast<const DataTypeNestedCustomName *>(data_type->getCustomName()) != nullptr; +} + +} + diff --git a/contrib/clickhouse/src/DataTypes/DataTypeNothing.cpp b/contrib/clickhouse/src/DataTypes/DataTypeNothing.cpp new file mode 100644 index 00000000000..c2b552035a0 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeNothing.cpp @@ -0,0 +1,31 @@ +#include <DataTypes/DataTypeNothing.h> +#include <DataTypes/Serializations/SerializationNothing.h> +#include <DataTypes/DataTypeFactory.h> +#include <Columns/ColumnNothing.h> + + +namespace DB +{ + +MutableColumnPtr DataTypeNothing::createColumn() const +{ + return ColumnNothing::create(0); +} + +bool DataTypeNothing::equals(const IDataType & 
rhs) const +{ + return typeid(rhs) == typeid(*this); +} + +SerializationPtr DataTypeNothing::doGetDefaultSerialization() const +{ + return std::make_shared<SerializationNothing>(); +} + + +void registerDataTypeNothing(DataTypeFactory & factory) +{ + factory.registerSimpleDataType("Nothing", [] { return DataTypePtr(std::make_shared<DataTypeNothing>()); }); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeNothing.h b/contrib/clickhouse/src/DataTypes/DataTypeNothing.h new file mode 100644 index 00000000000..c3a7e2d09f0 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeNothing.h @@ -0,0 +1,36 @@ +#pragma once + +#include <DataTypes/IDataTypeDummy.h> + + +namespace DB +{ + +/** Data type that cannot have any values. + * Used to represent NULL of unknown type as Nullable(Nothing), + * and possibly for empty array of unknown type as Array(Nothing). + */ +class DataTypeNothing final : public IDataTypeDummy +{ +public: + static constexpr bool is_parametric = false; + + const char * getFamilyName() const override { return "Nothing"; } + String getSQLCompatibleName() const override { return "TEXT"; } + + TypeIndex getTypeId() const override { return TypeIndex::Nothing; } + + MutableColumnPtr createColumn() const override; + + bool equals(const IDataType & rhs) const override; + + bool isParametric() const override { return false; } + bool textCanContainOnlyValidUTF8() const override { return true; } + bool haveMaximumSizeOfValue() const override { return true; } + size_t getSizeOfValueInMemory() const override { return 0; } + bool canBeInsideNullable() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeNullable.cpp b/contrib/clickhouse/src/DataTypes/DataTypeNullable.cpp new file mode 100644 index 00000000000..41a9a1de543 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeNullable.cpp @@ -0,0 +1,118 @@ +#include 
<DataTypes/DataTypeNullable.h> +#include <DataTypes/DataTypeNothing.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/Serializations/SerializationNullable.h> +#include <DataTypes/DataTypeLowCardinality.h> +#include <Columns/ColumnNullable.h> +#include <Core/Field.h> +#include <Parsers/IAST.h> +#include <Common/typeid_cast.h> +#include <Common/assert_cast.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + + +DataTypeNullable::DataTypeNullable(const DataTypePtr & nested_data_type_) + : nested_data_type{nested_data_type_} +{ + if (!nested_data_type->canBeInsideNullable()) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Nested type {} cannot be inside Nullable type", nested_data_type->getName()); +} + + +bool DataTypeNullable::onlyNull() const +{ + return typeid_cast<const DataTypeNothing *>(nested_data_type.get()); +} + + +MutableColumnPtr DataTypeNullable::createColumn() const +{ + return ColumnNullable::create(nested_data_type->createColumn(), ColumnUInt8::create()); +} + +Field DataTypeNullable::getDefault() const +{ + return Null(); +} + +size_t DataTypeNullable::getSizeOfValueInMemory() const +{ + throw Exception(ErrorCodes::LOGICAL_ERROR, "Value of type {} in memory is not of fixed size.", getName()); +} + + +bool DataTypeNullable::equals(const IDataType & rhs) const +{ + return rhs.isNullable() && nested_data_type->equals(*static_cast<const DataTypeNullable &>(rhs).nested_data_type); +} + +SerializationPtr DataTypeNullable::doGetDefaultSerialization() const +{ + return std::make_shared<SerializationNullable>(nested_data_type->getDefaultSerialization()); +} + + +static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Nullable data type family must have exactly one argument - nested type"); 
+ + DataTypePtr nested_type = DataTypeFactory::instance().get(arguments->children[0]); + + return std::make_shared<DataTypeNullable>(nested_type); +} + + +void registerDataTypeNullable(DataTypeFactory & factory) +{ + factory.registerDataType("Nullable", create); +} + +/** Returns type unchanged when it is already Nullable; otherwise wraps it in DataTypeNullable. */ +DataTypePtr makeNullable(const DataTypePtr & type) +{ + if (type->isNullable()) + return type; + return std::make_shared<DataTypeNullable>(type); +} +/** Like makeNullable, but returns type unchanged when it reports !canBeInsideNullable(). */ +DataTypePtr makeNullableSafe(const DataTypePtr & type) +{ + if (type->canBeInsideNullable()) + return makeNullable(type); + return type; +} +/** Returns the nested type of a Nullable type; any other type is returned unchanged. */ +DataTypePtr removeNullable(const DataTypePtr & type) +{ + if (type->isNullable()) + return static_cast<const DataTypeNullable &>(*type).getNestedType(); + return type; +} +/** Returns type unchanged when isNullableOrLowCardinalityNullable(type) holds; turns LowCardinality(T) into LowCardinality(makeNullable(T)); wraps anything else in DataTypeNullable. */ +DataTypePtr makeNullableOrLowCardinalityNullable(const DataTypePtr & type) +{ + if (isNullableOrLowCardinalityNullable(type)) + return type; + + if (type->lowCardinality()) + { + const auto & dictionary_type = assert_cast<const DataTypeLowCardinality &>(*type).getDictionaryType(); + return std::make_shared<DataTypeLowCardinality>(makeNullable(dictionary_type)); + } + + return std::make_shared<DataTypeNullable>(type); +} + + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeNullable.h b/contrib/clickhouse/src/DataTypes/DataTypeNullable.h new file mode 100644 index 00000000000..e3165414c07 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeNullable.h @@ -0,0 +1,59 @@ +#pragma once + +#include <DataTypes/IDataType.h> + +namespace DB +{ + +/// A nullable data type is an ordinary data type provided with a tag +/// indicating that it also contains the NULL value. The following class +/// embodies this concept. 
+class DataTypeNullable final : public IDataType +{ +public: + static constexpr bool is_parametric = true; + + explicit DataTypeNullable(const DataTypePtr & nested_data_type_); + std::string doGetName() const override { return "Nullable(" + nested_data_type->getName() + ")"; } + const char * getFamilyName() const override { return "Nullable"; } + String getSQLCompatibleName() const override { return nested_data_type->getSQLCompatibleName(); } + TypeIndex getTypeId() const override { return TypeIndex::Nullable; } + + MutableColumnPtr createColumn() const override; + + Field getDefault() const override; + + bool equals(const IDataType & rhs) const override; + + bool isParametric() const override { return true; } + bool haveSubtypes() const override { return true; } + bool cannotBeStoredInTables() const override { return nested_data_type->cannotBeStoredInTables(); } + bool shouldAlignRightInPrettyFormats() const override { return nested_data_type->shouldAlignRightInPrettyFormats(); } + bool textCanContainOnlyValidUTF8() const override { return nested_data_type->textCanContainOnlyValidUTF8(); } + bool isComparable() const override { return nested_data_type->isComparable(); } + bool canBeComparedWithCollation() const override { return nested_data_type->canBeComparedWithCollation(); } + bool canBeUsedAsVersion() const override { return false; } + bool isSummable() const override { return nested_data_type->isSummable(); } + bool canBeUsedInBooleanContext() const override { return nested_data_type->canBeUsedInBooleanContext() || onlyNull(); } + bool haveMaximumSizeOfValue() const override { return nested_data_type->haveMaximumSizeOfValue(); } + size_t getMaximumSizeOfValueInMemory() const override { return 1 + nested_data_type->getMaximumSizeOfValueInMemory(); } + bool isNullable() const override { return true; } + size_t getSizeOfValueInMemory() const override; + bool onlyNull() const override; + bool canBeInsideLowCardinality() const override { return 
nested_data_type->canBeInsideLowCardinality(); } + bool canBePromoted() const override { return nested_data_type->canBePromoted(); } + + const DataTypePtr & getNestedType() const { return nested_data_type; } +private: + SerializationPtr doGetDefaultSerialization() const override; + + DataTypePtr nested_data_type; +}; + + +DataTypePtr makeNullable(const DataTypePtr & type); +DataTypePtr makeNullableSafe(const DataTypePtr & type); +DataTypePtr removeNullable(const DataTypePtr & type); +DataTypePtr makeNullableOrLowCardinalityNullable(const DataTypePtr & type); + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeNumberBase.cpp b/contrib/clickhouse/src/DataTypes/DataTypeNumberBase.cpp new file mode 100644 index 00000000000..4cefc4945c6 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeNumberBase.cpp @@ -0,0 +1,76 @@ +#include <type_traits> +#include <DataTypes/DataTypeNumberBase.h> +#include <Columns/ColumnVector.h> + + +namespace DB +{ + +template <typename T> +Field DataTypeNumberBase<T>::getDefault() const +{ + return NearestFieldType<FieldType>(); +} +template <typename T> +String DataTypeNumberBase<T>::getSQLCompatibleName() const +{ + if constexpr (std::is_same_v<T, Int8>) + return "TINYINT"; + else if constexpr (std::is_same_v<T, Int16>) + return "SMALLINT"; + else if constexpr (std::is_same_v<T, Int32>) + return "INTEGER"; + else if constexpr (std::is_same_v<T, Int64>) + return "BIGINT"; + else if constexpr (std::is_same_v<T, UInt8>) + return "TINYINT UNSIGNED"; + else if constexpr (std::is_same_v<T, UInt16>) + return "SMALLINT UNSIGNED"; + else if constexpr (std::is_same_v<T, UInt32>) + return "INTEGER UNSIGNED"; + else if constexpr (std::is_same_v<T, UInt64>) + return "BIGINT UNSIGNED"; + else if constexpr (std::is_same_v<T, Float32>) + return "FLOAT"; + else if constexpr (std::is_same_v<T, Float64>) + return "DOUBLE"; + /// Unsupported types are converted to TEXT + else + return "TEXT"; +} + +template <typename T> +MutableColumnPtr 
DataTypeNumberBase<T>::createColumn() const +{ + return ColumnVector<T>::create(); +} + +template <typename T> +bool DataTypeNumberBase<T>::isValueRepresentedByInteger() const +{ + return is_integer<T>; +} + +template <typename T> +bool DataTypeNumberBase<T>::isValueRepresentedByUnsignedInteger() const +{ + return is_integer<T> && is_unsigned_v<T>; +} + +/// Explicit template instantiations - to avoid code bloat in headers. +template class DataTypeNumberBase<UInt8>; +template class DataTypeNumberBase<UInt16>; +template class DataTypeNumberBase<UInt32>; +template class DataTypeNumberBase<UInt64>; +template class DataTypeNumberBase<UInt128>; +template class DataTypeNumberBase<UInt256>; +template class DataTypeNumberBase<Int8>; +template class DataTypeNumberBase<Int16>; +template class DataTypeNumberBase<Int32>; +template class DataTypeNumberBase<Int64>; +template class DataTypeNumberBase<Int128>; +template class DataTypeNumberBase<Int256>; +template class DataTypeNumberBase<Float32>; +template class DataTypeNumberBase<Float64>; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeNumberBase.h b/contrib/clickhouse/src/DataTypes/DataTypeNumberBase.h new file mode 100644 index 00000000000..d902c62505e --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeNumberBase.h @@ -0,0 +1,75 @@ +#pragma once + +#include <base/TypeName.h> +#include <Core/TypeId.h> +#include <DataTypes/IDataType.h> +#include <DataTypes/Serializations/SerializationNumber.h> + + +namespace DB +{ + +/** Implements part of the IDataType interface, common to all numbers and for Date and DateTime. 
+ */ +template <typename T> +class DataTypeNumberBase : public IDataType +{ + static_assert(is_arithmetic_v<T>); + +public: + static constexpr bool is_parametric = false; + static constexpr auto family_name = TypeName<T>; + static constexpr auto type_id = TypeToTypeIndex<T>; + + using FieldType = T; + using ColumnType = ColumnVector<T>; + + const char * getFamilyName() const override { return TypeName<T>.data(); } + String getSQLCompatibleName() const override; + TypeIndex getTypeId() const override { return TypeToTypeIndex<T>; } + + Field getDefault() const override; + + MutableColumnPtr createColumn() const override; + + bool isParametric() const override { return false; } + bool haveSubtypes() const override { return false; } + + bool shouldAlignRightInPrettyFormats() const override + { + /// Just a number, without customizations. Counterexample: IPv4. + return !custom_serialization; + } + + bool textCanContainOnlyValidUTF8() const override { return true; } + bool isComparable() const override { return true; } + bool isValueRepresentedByNumber() const override { return true; } + bool isValueRepresentedByInteger() const override; + bool isValueRepresentedByUnsignedInteger() const override; + bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override { return true; } + bool haveMaximumSizeOfValue() const override { return true; } + size_t getSizeOfValueInMemory() const override { return sizeof(T); } + bool isCategorial() const override { return isValueRepresentedByInteger(); } + bool canBeInsideLowCardinality() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override { return std::make_shared<SerializationNumber<T>>(); } +}; + +/// Prevent implicit template instantiation of DataTypeNumberBase for common numeric types + +extern template class DataTypeNumberBase<UInt8>; +extern template class DataTypeNumberBase<UInt16>; +extern template class DataTypeNumberBase<UInt32>; +extern template class 
DataTypeNumberBase<UInt64>; +extern template class DataTypeNumberBase<UInt128>; +extern template class DataTypeNumberBase<UInt256>; +extern template class DataTypeNumberBase<Int16>; +extern template class DataTypeNumberBase<Int8>; +extern template class DataTypeNumberBase<Int32>; +extern template class DataTypeNumberBase<Int64>; +extern template class DataTypeNumberBase<Int128>; +extern template class DataTypeNumberBase<Int256>; +extern template class DataTypeNumberBase<Float32>; +extern template class DataTypeNumberBase<Float64>; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeObject.cpp b/contrib/clickhouse/src/DataTypes/DataTypeObject.cpp new file mode 100644 index 00000000000..720436d0e0d --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeObject.cpp @@ -0,0 +1,82 @@ +#include <DataTypes/DataTypeObject.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/Serializations/SerializationObject.h> + +#include <Parsers/IAST.h> +#include <Parsers/ASTLiteral.h> +#include <Parsers/ASTFunction.h> +#include <IO/Operators.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int UNEXPECTED_AST_STRUCTURE; +} + +DataTypeObject::DataTypeObject(const String & schema_format_, bool is_nullable_) + : schema_format(Poco::toLower(schema_format_)) + , is_nullable(is_nullable_) +{ +} + +bool DataTypeObject::equals(const IDataType & rhs) const +{ + if (const auto * object = typeid_cast<const DataTypeObject *>(&rhs)) + return schema_format == object->schema_format && is_nullable == object->is_nullable; + return false; +} + +SerializationPtr DataTypeObject::doGetDefaultSerialization() const +{ + return getObjectSerialization(schema_format); +} + +String DataTypeObject::doGetName() const +{ + WriteBufferFromOwnString out; + if (is_nullable) + out << "Object(Nullable(" << quote << schema_format << "))"; + else + out << "Object(" << quote << schema_format << ")"; + return out.str(); +} + +static 
DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Object data type family must have one argument - name of schema format"); + + ASTPtr schema_argument = arguments->children[0]; + bool is_nullable = false; + + if (const auto * func = schema_argument->as<ASTFunction>()) + { + if (func->name != "Nullable" || func->arguments->children.size() != 1) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, + "Expected 'Nullable(<schema_name>)' as parameter for type Object (function: {})", func->name); + + schema_argument = func->arguments->children[0]; + is_nullable = true; + } + + const auto * literal = schema_argument->as<ASTLiteral>(); + if (!literal || literal->value.getType() != Field::Types::String) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, + "Object data type family must have a const string as its schema name parameter"); + + return std::make_shared<DataTypeObject>(literal->value.get<const String &>(), is_nullable); +} + +void registerDataTypeObject(DataTypeFactory & factory) +{ + factory.registerDataType("Object", create); + factory.registerSimpleDataType("JSON", + [] { return std::make_shared<DataTypeObject>("JSON", false); }, + DataTypeFactory::CaseInsensitive); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeObject.h b/contrib/clickhouse/src/DataTypes/DataTypeObject.h new file mode 100644 index 00000000000..2e1e5398f7e --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeObject.h @@ -0,0 +1,49 @@ +#pragma once + +#include <DataTypes/IDataType.h> +#include <Core/Field.h> +#include <Columns/ColumnObject.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +class DataTypeObject : public IDataType +{ +private: + String schema_format; + bool is_nullable; + +public: + DataTypeObject(const String & schema_format_, bool is_nullable_); + + const char * getFamilyName() const 
override { return "Object"; } + String getSQLCompatibleName() const override { return "JSON"; } + String doGetName() const override; + TypeIndex getTypeId() const override { return TypeIndex::Object; } + + MutableColumnPtr createColumn() const override { return ColumnObject::create(is_nullable); } + + Field getDefault() const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getDefault() is not implemented for data type {}", getName()); + } + + bool haveSubtypes() const override { return false; } + bool equals(const IDataType & rhs) const override; + bool isParametric() const override { return true; } + bool hasDynamicSubcolumns() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override; + + bool hasNullableSubcolumns() const { return is_nullable; } + + const String & getSchemaFormat() const { return schema_format; } +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeSet.h b/contrib/clickhouse/src/DataTypes/DataTypeSet.h new file mode 100644 index 00000000000..e71a345a195 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeSet.h @@ -0,0 +1,32 @@ +#pragma once + +#include <DataTypes/IDataTypeDummy.h> +#include <Columns/ColumnSet.h> + + +namespace DB +{ + +/** The data type corresponding to the set of values in the IN section. + * Used only as an intermediate when evaluating expressions. + */ +class DataTypeSet final : public IDataTypeDummy +{ +public: + static constexpr bool is_parametric = true; + const char * getFamilyName() const override { return "Set"; } + String getSQLCompatibleName() const override { return "TEXT"; } + + TypeIndex getTypeId() const override { return TypeIndex::Set; } + bool equals(const IDataType & rhs) const override { return typeid(rhs) == typeid(*this); } + bool isParametric() const override { return true; } + + // Used for expressions analysis. 
+ MutableColumnPtr createColumn() const override { return ColumnSet::create(0, nullptr); } + + // Used only for debugging, making it DUMPABLE + Field getDefault() const override { return Tuple(); } +}; + +} + diff --git a/contrib/clickhouse/src/DataTypes/DataTypeString.cpp b/contrib/clickhouse/src/DataTypes/DataTypeString.cpp new file mode 100644 index 00000000000..95e49420009 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeString.cpp @@ -0,0 +1,99 @@ +#include <Columns/ColumnString.h> +#include <Core/Field.h> + +#include <DataTypes/DataTypeString.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/Serializations/SerializationString.h> + +#include <Parsers/IAST.h> +#include <Parsers/ASTLiteral.h> + +namespace DB +{ + + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int UNEXPECTED_AST_STRUCTURE; +} + +Field DataTypeString::getDefault() const +{ + return String(); +} + +MutableColumnPtr DataTypeString::createColumn() const +{ + return ColumnString::create(); +} + + +bool DataTypeString::equals(const IDataType & rhs) const +{ + return typeid(rhs) == typeid(*this); +} + +SerializationPtr DataTypeString::doGetDefaultSerialization() const +{ + return std::make_shared<SerializationString>(); +} + +static DataTypePtr create(const ASTPtr & arguments) +{ + if (arguments && !arguments->children.empty()) + { + if (arguments->children.size() > 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "String data type family mustn't have more than one argument - size in characters"); + + const auto * argument = arguments->children[0]->as<ASTLiteral>(); + if (!argument || argument->value.getType() != Field::Types::UInt64) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "String data type family may have only a number (positive integer) as its argument"); + } + + return std::make_shared<DataTypeString>(); +} + + +void registerDataTypeString(DataTypeFactory & factory) +{ + 
factory.registerDataType("String", create); + + /// These synonyms are added for compatibility. + + factory.registerAlias("CHAR", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("NCHAR", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("CHARACTER", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("VARCHAR", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("NVARCHAR", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("VARCHAR2", "String", DataTypeFactory::CaseInsensitive); /// Oracle + factory.registerAlias("TEXT", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("TINYTEXT", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("MEDIUMTEXT", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("LONGTEXT", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("BLOB", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("CLOB", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("TINYBLOB", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("MEDIUMBLOB", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("LONGBLOB", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("BYTEA", "String", DataTypeFactory::CaseInsensitive); /// PostgreSQL + + factory.registerAlias("CHARACTER LARGE OBJECT", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("CHARACTER VARYING", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("CHAR LARGE OBJECT", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("CHAR VARYING", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("NATIONAL CHAR", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("NATIONAL CHARACTER", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("NATIONAL CHARACTER LARGE OBJECT", 
"String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("NATIONAL CHARACTER VARYING", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("NATIONAL CHAR VARYING", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("NCHAR VARYING", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("NCHAR LARGE OBJECT", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("BINARY LARGE OBJECT", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("BINARY VARYING", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("VARBINARY", "String", DataTypeFactory::CaseInsensitive); + factory.registerAlias("GEOMETRY", "String", DataTypeFactory::CaseInsensitive); //mysql + +} +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeString.h b/contrib/clickhouse/src/DataTypes/DataTypeString.h new file mode 100644 index 00000000000..c39fa90f6e7 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeString.h @@ -0,0 +1,46 @@ +#pragma once + +#include <DataTypes/IDataType.h> + + +namespace DB +{ + +class ColumnString; + +class DataTypeString final : public IDataType +{ +public: + using FieldType = String; + using ColumnType = ColumnString; + static constexpr bool is_parametric = false; + static constexpr auto type_id = TypeIndex::String; + + const char * getFamilyName() const override + { + return "String"; + } + + String getSQLCompatibleName() const override { return "BLOB"; } + + TypeIndex getTypeId() const override { return type_id; } + + MutableColumnPtr createColumn() const override; + + Field getDefault() const override; + + bool equals(const IDataType & rhs) const override; + + bool isParametric() const override { return false; } + bool haveSubtypes() const override { return false; } + bool isComparable() const override { return true; } + bool canBeComparedWithCollation() const override { return true; } + bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const 
override { return true; } + bool isCategorial() const override { return true; } + bool canBeInsideNullable() const override { return true; } + bool canBeInsideLowCardinality() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeTuple.cpp b/contrib/clickhouse/src/DataTypes/DataTypeTuple.cpp new file mode 100644 index 00000000000..768f87fe3d4 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeTuple.cpp @@ -0,0 +1,372 @@ +#include <base/map.h> +#include <base/range.h> +#include <Common/StringUtils/StringUtils.h> +#include <Columns/ColumnTuple.h> +#include <Columns/ColumnConst.h> +#include <Core/Field.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/Serializations/SerializationInfo.h> +#include <DataTypes/Serializations/SerializationTuple.h> +#include <DataTypes/Serializations/SerializationNamed.h> +#include <DataTypes/Serializations/SerializationInfoTuple.h> +#include <DataTypes/NestedUtils.h> +#include <Parsers/IAST.h> +#include <Parsers/ASTNameTypePair.h> +#include <Common/assert_cast.h> +#include <Common/quoteString.h> +#include <IO/WriteHelpers.h> +#include <IO/WriteBufferFromString.h> +#include <IO/Operators.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int DUPLICATE_COLUMN; + extern const int EMPTY_DATA_PASSED; + extern const int NOT_FOUND_COLUMN_IN_BLOCK; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH; + extern const int ILLEGAL_INDEX; + extern const int LOGICAL_ERROR; +} + + +DataTypeTuple::DataTypeTuple(const DataTypes & elems_) + : elems(elems_), have_explicit_names(false) +{ + /// Automatically assigned names in form of '1', '2', ... 
+ size_t size = elems.size(); + names.resize(size); + for (size_t i = 0; i < size; ++i) + names[i] = toString(i + 1); +} + +static std::optional<Exception> checkTupleNames(const Strings & names) +{ + std::unordered_set<String> names_set; + for (const auto & name : names) + { + if (name.empty()) + return Exception(ErrorCodes::BAD_ARGUMENTS, "Names of tuple elements cannot be empty"); + + if (!names_set.insert(name).second) + return Exception(ErrorCodes::DUPLICATE_COLUMN, "Names of tuple elements must be unique"); + } + + return {}; +} + +DataTypeTuple::DataTypeTuple(const DataTypes & elems_, const Strings & names_) + : elems(elems_), names(names_), have_explicit_names(true) +{ + size_t size = elems.size(); + if (names.size() != size) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Wrong number of names passed to constructor of DataTypeTuple"); + + if (auto exception = checkTupleNames(names)) + throw std::move(*exception); +} + +std::string DataTypeTuple::doGetName() const +{ + size_t size = elems.size(); + WriteBufferFromOwnString s; + + s << "Tuple("; + for (size_t i = 0; i < size; ++i) + { + if (i != 0) + s << ", "; + + if (have_explicit_names) + s << backQuoteIfNeed(names[i]) << ' '; + + s << elems[i]->getName(); + } + s << ")"; + + return s.str(); +} + + +static inline IColumn & extractElementColumn(IColumn & column, size_t idx) +{ + return assert_cast<ColumnTuple &>(column).getColumn(idx); +} + +template <typename F> +static void addElementSafe(const DataTypes & elems, IColumn & column, F && impl) +{ + /// We use the assumption that tuples of zero size do not exist. + size_t old_size = column.size(); + + try + { + impl(); + + // Check that all columns now have the same size. 
+ size_t new_size = column.size(); + + for (auto i : collections::range(0, elems.size())) + { + const auto & element_column = extractElementColumn(column, i); + if (element_column.size() != new_size) + { + // This is not a logical error because it may work with + // user-supplied data. + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH, + "Cannot read a tuple because not all elements are present"); + } + } + } + catch (...) + { + for (const auto & i : collections::range(0, elems.size())) + { + auto & element_column = extractElementColumn(column, i); + + if (element_column.size() > old_size) + element_column.popBack(1); + } + + throw; + } +} + +MutableColumnPtr DataTypeTuple::createColumn() const +{ + size_t size = elems.size(); + MutableColumns tuple_columns(size); + for (size_t i = 0; i < size; ++i) + tuple_columns[i] = elems[i]->createColumn(); + return ColumnTuple::create(std::move(tuple_columns)); +} + +MutableColumnPtr DataTypeTuple::createColumn(const ISerialization & serialization) const +{ + /// If we read subcolumn of nested Tuple, it may be wrapped to SerializationNamed + /// several times to allow to reconstruct the substream path name. + /// Here we don't need substream path name, so we drop first several wrapper serializations. 
+ + const auto * current_serialization = &serialization; + while (const auto * serialization_named = typeid_cast<const SerializationNamed *>(current_serialization)) + current_serialization = serialization_named->getNested().get(); + + const auto * serialization_tuple = typeid_cast<const SerializationTuple *>(current_serialization); + if (!serialization_tuple) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected serialization to create column of type Tuple"); + + const auto & element_serializations = serialization_tuple->getElementsSerializations(); + + size_t size = elems.size(); + assert(element_serializations.size() == size); + MutableColumns tuple_columns(size); + for (size_t i = 0; i < size; ++i) + tuple_columns[i] = elems[i]->createColumn(*element_serializations[i]->getNested()); + + return ColumnTuple::create(std::move(tuple_columns)); +} + +Field DataTypeTuple::getDefault() const +{ + return Tuple(collections::map<Tuple>(elems, [] (const DataTypePtr & elem) { return elem->getDefault(); })); +} + +void DataTypeTuple::insertDefaultInto(IColumn & column) const +{ + addElementSafe(elems, column, [&] + { + for (const auto & i : collections::range(0, elems.size())) + elems[i]->insertDefaultInto(extractElementColumn(column, i)); + }); +} + +bool DataTypeTuple::equals(const IDataType & rhs) const +{ + if (typeid(rhs) != typeid(*this)) + return false; + + const DataTypeTuple & rhs_tuple = static_cast<const DataTypeTuple &>(rhs); + + size_t size = elems.size(); + if (size != rhs_tuple.elems.size()) + return false; + + for (size_t i = 0; i < size; ++i) + if (!elems[i]->equals(*rhs_tuple.elems[i]) || names[i] != rhs_tuple.names[i]) + return false; + + return true; +} + + +size_t DataTypeTuple::getPositionByName(const String & name) const +{ + size_t size = elems.size(); + for (size_t i = 0; i < size; ++i) + if (names[i] == name) + return i; + throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, "Tuple doesn't have element with name '{}'", name); +} + 
+std::optional<size_t> DataTypeTuple::tryGetPositionByName(const String & name) const +{ + size_t size = elems.size(); + for (size_t i = 0; i < size; ++i) + { + if (names[i] == name) + { + return std::optional<size_t>(i); + } + } + return std::nullopt; +} + +/// Returns the name of the tuple element at 1-based position `i`; throws ILLEGAL_INDEX when `i` is 0 or greater than the number of names. +String DataTypeTuple::getNameByPosition(size_t i) const +{ + if (i == 0 || i > names.size()) + throw Exception(ErrorCodes::ILLEGAL_INDEX, "Index of tuple element ({}) is out of range ([1, {}])", i, names.size()); + + return names[i - 1]; +} + + +bool DataTypeTuple::textCanContainOnlyValidUTF8() const +{ + return std::all_of(elems.begin(), elems.end(), [](auto && elem) { return elem->textCanContainOnlyValidUTF8(); }); +} + +bool DataTypeTuple::haveMaximumSizeOfValue() const +{ + return std::all_of(elems.begin(), elems.end(), [](auto && elem) { return elem->haveMaximumSizeOfValue(); }); +} + +bool DataTypeTuple::hasDynamicSubcolumns() const +{ + return std::any_of(elems.begin(), elems.end(), [](auto && elem) { return elem->hasDynamicSubcolumns(); }); +} + +bool DataTypeTuple::isComparable() const +{ + return std::all_of(elems.begin(), elems.end(), [](auto && elem) { return elem->isComparable(); }); +} + +size_t DataTypeTuple::getMaximumSizeOfValueInMemory() const +{ + size_t res = 0; + for (const auto & elem : elems) + res += elem->getMaximumSizeOfValueInMemory(); + return res; +} + +size_t DataTypeTuple::getSizeOfValueInMemory() const +{ + size_t res = 0; + for (const auto & elem : elems) + res += elem->getSizeOfValueInMemory(); + return res; +} + +SerializationPtr DataTypeTuple::doGetDefaultSerialization() const +{ + SerializationTuple::ElementSerializations serializations(elems.size()); + + for (size_t i = 0; i < elems.size(); ++i) + { + String elem_name = have_explicit_names ? 
names[i] : toString(i + 1); + auto serialization = elems[i]->getDefaultSerialization(); + serializations[i] = std::make_shared<SerializationNamed>(serialization, elem_name); + } + + return std::make_shared<SerializationTuple>(std::move(serializations), have_explicit_names); +} + +SerializationPtr DataTypeTuple::getSerialization(const SerializationInfo & info) const +{ + SerializationTuple::ElementSerializations serializations(elems.size()); + const auto & info_tuple = assert_cast<const SerializationInfoTuple &>(info); + + for (size_t i = 0; i < elems.size(); ++i) + { + String elem_name = have_explicit_names ? names[i] : toString(i + 1); + auto serialization = elems[i]->getSerialization(*info_tuple.getElementInfo(i)); + serializations[i] = std::make_shared<SerializationNamed>(serialization, elem_name); + } + + return std::make_shared<SerializationTuple>(std::move(serializations), have_explicit_names); +} + +MutableSerializationInfoPtr DataTypeTuple::createSerializationInfo(const SerializationInfo::Settings & settings) const +{ + MutableSerializationInfos infos; + infos.reserve(elems.size()); + for (const auto & elem : elems) + infos.push_back(elem->createSerializationInfo(settings)); + + return std::make_shared<SerializationInfoTuple>(std::move(infos), names, settings); +} + +SerializationInfoPtr DataTypeTuple::getSerializationInfo(const IColumn & column) const +{ + if (const auto * column_const = checkAndGetColumn<ColumnConst>(&column)) + return getSerializationInfo(column_const->getDataColumn()); + + MutableSerializationInfos infos; + infos.reserve(elems.size()); + + const auto & column_tuple = assert_cast<const ColumnTuple &>(column); + assert(elems.size() == column_tuple.getColumns().size()); + + for (size_t i = 0; i < elems.size(); ++i) + { + auto element_info = elems[i]->getSerializationInfo(column_tuple.getColumn(i)); + infos.push_back(const_pointer_cast<SerializationInfo>(element_info)); + } + + return 
std::make_shared<SerializationInfoTuple>(std::move(infos), names, SerializationInfo::Settings{}); +} + + +static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.empty()) + throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Tuple cannot be empty"); + + DataTypes nested_types; + nested_types.reserve(arguments->children.size()); + + Strings names; + names.reserve(arguments->children.size()); + + for (const ASTPtr & child : arguments->children) + { + if (const auto * name_and_type_pair = child->as<ASTNameTypePair>()) + { + nested_types.emplace_back(DataTypeFactory::instance().get(name_and_type_pair->type)); + names.emplace_back(name_and_type_pair->name); + } + else + nested_types.emplace_back(DataTypeFactory::instance().get(child)); + } + + if (names.empty()) + return std::make_shared<DataTypeTuple>(nested_types); + else if (names.size() != nested_types.size()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Names are specified not for all elements of Tuple type"); + else + return std::make_shared<DataTypeTuple>(nested_types, names); +} + + +void registerDataTypeTuple(DataTypeFactory & factory) +{ + factory.registerDataType("Tuple", create); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeTuple.h b/contrib/clickhouse/src/DataTypes/DataTypeTuple.h new file mode 100644 index 00000000000..0bf3f3ac8b3 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeTuple.h @@ -0,0 +1,76 @@ +#pragma once + +#include <DataTypes/IDataType.h> +#include <optional> + + +namespace DB +{ + +/** Tuple data type. + * Used as an intermediate result when evaluating expressions. + * Also can be used as a column - the result of the query execution. + * + * Tuple elements can have names. + * If an element is unnamed, it will have automatically assigned name like '1', '2', '3' corresponding to its position. + * Manually assigned names must not begin with digit. Names must be unique. 
+ * + * All tuples with same size and types of elements are equivalent for expressions, regardless to names of elements. + */ +class DataTypeTuple final : public IDataType +{ +private: + DataTypes elems; + Strings names; + bool have_explicit_names; + +public: + static constexpr bool is_parametric = true; + + explicit DataTypeTuple(const DataTypes & elems); + DataTypeTuple(const DataTypes & elems, const Strings & names); + + TypeIndex getTypeId() const override { return TypeIndex::Tuple; } + std::string doGetName() const override; + const char * getFamilyName() const override { return "Tuple"; } + String getSQLCompatibleName() const override { return "JSON"; } + + bool canBeInsideNullable() const override { return false; } + bool supportsSparseSerialization() const override { return true; } + bool canBeInsideSparseColumns() const override { return false; } + + MutableColumnPtr createColumn() const override; + MutableColumnPtr createColumn(const ISerialization & serialization) const override; + + Field getDefault() const override; + void insertDefaultInto(IColumn & column) const override; + + bool equals(const IDataType & rhs) const override; + + bool isParametric() const override { return true; } + bool haveSubtypes() const override { return !elems.empty(); } + bool isComparable() const override; + bool textCanContainOnlyValidUTF8() const override; + bool haveMaximumSizeOfValue() const override; + bool hasDynamicSubcolumns() const override; + size_t getMaximumSizeOfValueInMemory() const override; + size_t getSizeOfValueInMemory() const override; + + SerializationPtr doGetDefaultSerialization() const override; + SerializationPtr getSerialization(const SerializationInfo & info) const override; + MutableSerializationInfoPtr createSerializationInfo(const SerializationInfo::Settings & settings) const override; + SerializationInfoPtr getSerializationInfo(const IColumn & column) const override; + + const DataTypePtr & getElement(size_t i) const { return elems[i]; } + const 
DataTypes & getElements() const { return elems; } + const Strings & getElementNames() const { return names; } + + size_t getPositionByName(const String & name) const; + std::optional<size_t> tryGetPositionByName(const String & name) const; + String getNameByPosition(size_t i) const; + + bool haveExplicitNames() const { return have_explicit_names; } +}; + +} + diff --git a/contrib/clickhouse/src/DataTypes/DataTypeUUID.cpp b/contrib/clickhouse/src/DataTypes/DataTypeUUID.cpp new file mode 100644 index 00000000000..44182a700b4 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeUUID.cpp @@ -0,0 +1,34 @@ +#include <DataTypes/DataTypeUUID.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/Serializations/SerializationUUID.h> + + +namespace DB +{ + +bool DataTypeUUID::equals(const IDataType & rhs) const +{ + return typeid(rhs) == typeid(*this); +} + +SerializationPtr DataTypeUUID::doGetDefaultSerialization() const +{ + return std::make_shared<SerializationUUID>(); +} + +Field DataTypeUUID::getDefault() const +{ + return UUID{}; +} + +MutableColumnPtr DataTypeUUID::createColumn() const +{ + return ColumnVector<UUID>::create(); +} + +void registerDataTypeUUID(DataTypeFactory & factory) +{ + factory.registerSimpleDataType("UUID", [] { return DataTypePtr(std::make_shared<DataTypeUUID>()); }); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypeUUID.h b/contrib/clickhouse/src/DataTypes/DataTypeUUID.h new file mode 100644 index 00000000000..8664c3bcfd1 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypeUUID.h @@ -0,0 +1,50 @@ +#pragma once + +#include <DataTypes/IDataType.h> +#include <Columns/ColumnVector.h> +#include <Core/UUID.h> + + +namespace DB +{ + +class DataTypeUUID : public IDataType +{ +public: + static constexpr bool is_parametric = false; + + using FieldType = UUID; + using ColumnType = ColumnVector<UUID>; + static constexpr auto type_id = TypeIndex::UUID; + + const char * getFamilyName() const override { return "UUID"; } + 
String getSQLCompatibleName() const override { return "CHAR"; } + + TypeIndex getTypeId() const override { return type_id; } + + Field getDefault() const override; + + MutableColumnPtr createColumn() const override; + + bool isParametric() const override { return false; } + bool haveSubtypes() const override { return false; } + + bool equals(const IDataType & rhs) const override; + + bool canBeUsedInBitOperations() const override { return true; } + bool canBeInsideNullable() const override { return true; } + bool canBePromoted() const override { return false; } + bool shouldAlignRightInPrettyFormats() const override { return false; } + bool textCanContainOnlyValidUTF8() const override { return true; } + bool isComparable() const override { return true; } + bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override { return true; } + bool isValueUnambiguouslyRepresentedInFixedSizeContiguousMemoryRegion() const override { return true; } + bool haveMaximumSizeOfValue() const override { return true; } + size_t getSizeOfValueInMemory() const override { return sizeof(UUID); } + bool isCategorial() const override { return true; } + bool canBeInsideLowCardinality() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypesDecimal.cpp b/contrib/clickhouse/src/DataTypes/DataTypesDecimal.cpp new file mode 100644 index 00000000000..fa044d4ac9c --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypesDecimal.cpp @@ -0,0 +1,131 @@ +#include <DataTypes/DataTypesDecimal.h> +#include <DataTypes/Serializations/SerializationDecimal.h> + +#include <Common/typeid_cast.h> +#include <Core/DecimalFunctions.h> +#include <DataTypes/DataTypeFactory.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> +#include <IO/readDecimalText.h> +#include <Parsers/ASTLiteral.h> + +#include <type_traits> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int 
NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int DECIMAL_OVERFLOW; +} + + +template <is_decimal T> +std::string DataTypeDecimal<T>::doGetName() const +{ + return fmt::format("Decimal({}, {})", this->precision, this->scale); +} + +template <is_decimal T> +std::string DataTypeDecimal<T>::getSQLCompatibleName() const +{ + return fmt::format("DECIMAL({}, {})", this->precision, this->scale); +} + +template <is_decimal T> +bool DataTypeDecimal<T>::equals(const IDataType & rhs) const +{ + if (auto * ptype = typeid_cast<const DataTypeDecimal<T> *>(&rhs)) + return this->scale == ptype->getScale(); + return false; +} + +template <is_decimal T> +DataTypePtr DataTypeDecimal<T>::promoteNumericType() const +{ + if (sizeof(T) <= sizeof(Decimal128)) + return std::make_shared<DataTypeDecimal<Decimal128>>(DataTypeDecimal<Decimal128>::maxPrecision(), this->scale); + else + return std::make_shared<DataTypeDecimal<Decimal256>>(DataTypeDecimal<Decimal256>::maxPrecision(), this->scale); +} + +template <is_decimal T> +T DataTypeDecimal<T>::parseFromString(const String & str) const +{ + ReadBufferFromMemory buf(str.data(), str.size()); + T x; + UInt32 unread_scale = this->scale; + readDecimalText(buf, x, this->precision, unread_scale, true); + + if (common::mulOverflow(x.value, DecimalUtils::scaleMultiplier<T>(unread_scale), x.value)) + throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "Decimal math overflow"); + + return x; +} + +template <is_decimal T> +SerializationPtr DataTypeDecimal<T>::doGetDefaultSerialization() const +{ + return std::make_shared<SerializationDecimal<T>>(this->precision, this->scale); +} + + +static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.size() != 2) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Decimal data type family must have exactly two arguments: precision and scale"); + + const auto * precision = arguments->children[0]->as<ASTLiteral>(); + const 
auto * scale = arguments->children[1]->as<ASTLiteral>(); + + if (!precision || precision->value.getType() != Field::Types::UInt64 || + !scale || !(scale->value.getType() == Field::Types::Int64 || scale->value.getType() == Field::Types::UInt64)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Decimal data type family must have two numbers as its arguments"); + + UInt64 precision_value = precision->value.get<UInt64>(); + UInt64 scale_value = scale->value.get<UInt64>(); + + return createDecimal<DataTypeDecimal>(precision_value, scale_value); +} + +template <typename T> +static DataTypePtr createExact(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Decimal32 | Decimal64 | Decimal128 | Decimal256 data type family must have exactly one arguments: scale"); + const auto * scale_arg = arguments->children[0]->as<ASTLiteral>(); + + if (!scale_arg || !(scale_arg->value.getType() == Field::Types::Int64 || scale_arg->value.getType() == Field::Types::UInt64)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Decimal32 | Decimal64 | Decimal128 | Decimal256 data type family must have a one number as its argument"); + + UInt64 precision = DecimalUtils::max_precision<T>; + UInt64 scale = scale_arg->value.get<UInt64>(); + + return createDecimal<DataTypeDecimal>(precision, scale); +} + +void registerDataTypeDecimal(DataTypeFactory & factory) +{ + factory.registerDataType("Decimal32", createExact<Decimal32>, DataTypeFactory::CaseInsensitive); + factory.registerDataType("Decimal64", createExact<Decimal64>, DataTypeFactory::CaseInsensitive); + factory.registerDataType("Decimal128", createExact<Decimal128>, DataTypeFactory::CaseInsensitive); + factory.registerDataType("Decimal256", createExact<Decimal256>, DataTypeFactory::CaseInsensitive); + + factory.registerDataType("Decimal", create, DataTypeFactory::CaseInsensitive); + factory.registerAlias("DEC", "Decimal", 
DataTypeFactory::CaseInsensitive); + factory.registerAlias("NUMERIC", "Decimal", DataTypeFactory::CaseInsensitive); + factory.registerAlias("FIXED", "Decimal", DataTypeFactory::CaseInsensitive); +} + +/// Explicit template instantiations. +template class DataTypeDecimal<Decimal32>; +template class DataTypeDecimal<Decimal64>; +template class DataTypeDecimal<Decimal128>; +template class DataTypeDecimal<Decimal256>; + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypesDecimal.h b/contrib/clickhouse/src/DataTypes/DataTypesDecimal.h new file mode 100644 index 00000000000..5e4cfab7928 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypesDecimal.h @@ -0,0 +1,270 @@ +#pragma once + +#include <base/arithmeticOverflow.h> +#include <base/extended_types.h> +#include <Common/typeid_cast.h> +#include <DataTypes/IDataType.h> +#include <DataTypes/DataTypeDecimalBase.h> +#include <DataTypes/DataTypeDateTime64.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int DECIMAL_OVERFLOW; + extern const int LOGICAL_ERROR; +} + +/// Implements Decimal(P, S), where P is precision, S is scale. +/// Maximum precisions for underlying types are: +/// Int32 9 +/// Int64 18 +/// Int128 38 +/// Int256 76 +/// Operation between two decimals leads to Decimal(P, S), where +/// P is one of (9, 18, 38, 76); equals to the maximum precision for the biggest underlying type of operands. +/// S is maximum scale of operands. 
The allowed values are [0, precision] +template <is_decimal T> +class DataTypeDecimal final : public DataTypeDecimalBase<T> +{ + using Base = DataTypeDecimalBase<T>; + +public: + using typename Base::FieldType; + using typename Base::ColumnType; + using Base::Base; + + static constexpr auto family_name = "Decimal"; + + const char * getFamilyName() const override { return family_name; } + String getSQLCompatibleName() const override; + + std::string doGetName() const override; + TypeIndex getTypeId() const override { return TypeToTypeIndex<T>; } + bool canBePromoted() const override { return true; } + DataTypePtr promoteNumericType() const override; + + bool equals(const IDataType & rhs) const override; + T parseFromString(const String & str) const; + SerializationPtr doGetDefaultSerialization() const override; +}; + +using DataTypeDecimal32 = DataTypeDecimal<Decimal32>; +using DataTypeDecimal64 = DataTypeDecimal<Decimal64>; +using DataTypeDecimal128 = DataTypeDecimal<Decimal128>; +using DataTypeDecimal256 = DataTypeDecimal<Decimal256>; + +template <typename T> +inline const DataTypeDecimal<T> * checkDecimal(const IDataType & data_type) +{ + return typeid_cast<const DataTypeDecimal<T> *>(&data_type); +} + +inline UInt32 getDecimalScale(const IDataType & data_type) +{ + if (const auto * decimal_type = checkDecimal<Decimal32>(data_type)) + return decimal_type->getScale(); + if (const auto * decimal_type = checkDecimal<Decimal64>(data_type)) + return decimal_type->getScale(); + if (const auto * decimal_type = checkDecimal<Decimal128>(data_type)) + return decimal_type->getScale(); + if (const auto * decimal_type = checkDecimal<Decimal256>(data_type)) + return decimal_type->getScale(); + if (const auto * date_time_type = typeid_cast<const DataTypeDateTime64 *>(&data_type)) + return date_time_type->getScale(); + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get decimal scale from type {}", data_type.getName()); +} + +inline UInt32 getDecimalPrecision(const
IDataType & data_type) +{ + if (const auto * decimal_type = checkDecimal<Decimal32>(data_type)) + return decimal_type->getPrecision(); + if (const auto * decimal_type = checkDecimal<Decimal64>(data_type)) + return decimal_type->getPrecision(); + if (const auto * decimal_type = checkDecimal<Decimal128>(data_type)) + return decimal_type->getPrecision(); + if (const auto * decimal_type = checkDecimal<Decimal256>(data_type)) + return decimal_type->getPrecision(); + if (const auto * date_time_type = typeid_cast<const DataTypeDateTime64 *>(&data_type)) + return date_time_type->getPrecision(); + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get decimal precision from type {}", data_type.getName()); +} + +template <typename T> +inline UInt32 getDecimalScale(const DataTypeDecimal<T> & data_type) +{ + return data_type.getScale(); +} + +template <typename FromDataType, typename ToDataType, typename ReturnType = void> +requires (IsDataTypeDecimal<FromDataType> && IsDataTypeDecimal<ToDataType>) +inline ReturnType convertDecimalsImpl(const typename FromDataType::FieldType & value, UInt32 scale_from, UInt32 scale_to, typename ToDataType::FieldType & result) +{ + using FromFieldType = typename FromDataType::FieldType; + using ToFieldType = typename ToDataType::FieldType; + using MaxFieldType = std::conditional_t<(sizeof(FromFieldType) > sizeof(ToFieldType)), FromFieldType, ToFieldType>; + using MaxNativeType = typename MaxFieldType::NativeType; + + static constexpr bool throw_exception = std::is_same_v<ReturnType, void>; + + MaxNativeType converted_value; + if (scale_to > scale_from) + { + converted_value = DecimalUtils::scaleMultiplier<MaxNativeType>(scale_to - scale_from); + if (common::mulOverflow(static_cast<MaxNativeType>(value.value), converted_value, converted_value)) + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "{} convert overflow while multiplying {} by scale {}", + std::string(ToDataType::family_name), 
toString(value.value), toString(converted_value)); + else + return ReturnType(false); + } + } + else if (scale_to == scale_from) + { + converted_value = value.value; + } + else + { + converted_value = value.value / DecimalUtils::scaleMultiplier<MaxNativeType>(scale_from - scale_to); + } + + if constexpr (sizeof(FromFieldType) > sizeof(ToFieldType)) + { + if (converted_value < std::numeric_limits<typename ToFieldType::NativeType>::min() || + converted_value > std::numeric_limits<typename ToFieldType::NativeType>::max()) + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "{} convert overflow: {} is not in range ({}, {})", + std::string(ToDataType::family_name), toString(converted_value), + toString(std::numeric_limits<typename ToFieldType::NativeType>::min()), + toString(std::numeric_limits<typename ToFieldType::NativeType>::max())); + else + return ReturnType(false); + } + } + + result = static_cast<typename ToFieldType::NativeType>(converted_value); + + return ReturnType(true); +} + +template <typename FromDataType, typename ToDataType> +requires (IsDataTypeDecimal<FromDataType> && IsDataTypeDecimal<ToDataType>) +inline typename ToDataType::FieldType convertDecimals(const typename FromDataType::FieldType & value, UInt32 scale_from, UInt32 scale_to) +{ + using ToFieldType = typename ToDataType::FieldType; + ToFieldType result; + + convertDecimalsImpl<FromDataType, ToDataType, void>(value, scale_from, scale_to, result); + + return result; +} + +template <typename FromDataType, typename ToDataType> +requires (IsDataTypeDecimal<FromDataType> && IsDataTypeDecimal<ToDataType>) +inline bool tryConvertDecimals(const typename FromDataType::FieldType & value, UInt32 scale_from, UInt32 scale_to, typename ToDataType::FieldType & result) +{ + return convertDecimalsImpl<FromDataType, ToDataType, bool>(value, scale_from, scale_to, result); +} + +template <typename FromDataType, typename ToDataType, typename ReturnType> +requires 
(IsDataTypeDecimal<FromDataType> && is_arithmetic_v<typename ToDataType::FieldType>) +inline ReturnType convertFromDecimalImpl(const typename FromDataType::FieldType & value, UInt32 scale, typename ToDataType::FieldType& result) +{ + using FromFieldType = typename FromDataType::FieldType; + using ToFieldType = typename ToDataType::FieldType; + + return DecimalUtils::convertToImpl<ToFieldType, FromFieldType, ReturnType>(value, scale, result); +} + +template <typename FromDataType, typename ToDataType> +requires (IsDataTypeDecimal<FromDataType> && is_arithmetic_v<typename ToDataType::FieldType>) +inline typename ToDataType::FieldType convertFromDecimal(const typename FromDataType::FieldType & value, UInt32 scale) +{ + typename ToDataType::FieldType result; + + convertFromDecimalImpl<FromDataType, ToDataType, void>(value, scale, result); + + return result; +} + +template <typename FromDataType, typename ToDataType> +requires (IsDataTypeDecimal<FromDataType> && is_arithmetic_v<typename ToDataType::FieldType>) +inline bool tryConvertFromDecimal(const typename FromDataType::FieldType & value, UInt32 scale, typename ToDataType::FieldType& result) +{ + return convertFromDecimalImpl<FromDataType, ToDataType, bool>(value, scale, result); +} + +template <typename FromDataType, typename ToDataType, typename ReturnType> +requires (is_arithmetic_v<typename FromDataType::FieldType> && IsDataTypeDecimal<ToDataType>) +inline ReturnType convertToDecimalImpl(const typename FromDataType::FieldType & value, UInt32 scale, typename ToDataType::FieldType& result) +{ + using FromFieldType = typename FromDataType::FieldType; + using ToFieldType = typename ToDataType::FieldType; + using ToNativeType = typename ToFieldType::NativeType; + + static constexpr bool throw_exception = std::is_same_v<ReturnType, void>; + + if constexpr (std::is_floating_point_v<FromFieldType>) + { + if (!std::isfinite(value)) + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "{} 
convert overflow. Cannot convert infinity or NaN to decimal", ToDataType::family_name); + else + return ReturnType(false); + } + + auto out = value * static_cast<FromFieldType>(DecimalUtils::scaleMultiplier<ToNativeType>(scale)); + + if (out <= static_cast<FromFieldType>(std::numeric_limits<ToNativeType>::min()) || + out >= static_cast<FromFieldType>(std::numeric_limits<ToNativeType>::max())) + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "{} convert overflow. Float is out of Decimal range", ToDataType::family_name); + else + return ReturnType(false); + } + + result = static_cast<ToNativeType>(out); + return ReturnType(true); + } + else + { + if constexpr (is_big_int_v<FromFieldType>) + return ReturnType(convertDecimalsImpl<DataTypeDecimal<Decimal256>, ToDataType, ReturnType>(static_cast<Int256>(value), 0, scale, result)); + else if constexpr (std::is_same_v<FromFieldType, UInt64>) + return ReturnType(convertDecimalsImpl<DataTypeDecimal<Decimal128>, ToDataType, ReturnType>(static_cast<Int128>(value), 0, scale, result)); + else + return ReturnType(convertDecimalsImpl<DataTypeDecimal<Decimal64>, ToDataType, ReturnType>(static_cast<Int64>(value), 0, scale, result)); + } +} + +template <typename FromDataType, typename ToDataType> +requires (is_arithmetic_v<typename FromDataType::FieldType> && IsDataTypeDecimal<ToDataType>) +inline typename ToDataType::FieldType convertToDecimal(const typename FromDataType::FieldType & value, UInt32 scale) +{ + typename ToDataType::FieldType result; + convertToDecimalImpl<FromDataType, ToDataType, void>(value, scale, result); + return result; +} + +template <typename FromDataType, typename ToDataType> +requires (is_arithmetic_v<typename FromDataType::FieldType> && IsDataTypeDecimal<ToDataType>) +inline bool tryConvertToDecimal(const typename FromDataType::FieldType & value, UInt32 scale, typename ToDataType::FieldType& result) +{ + return convertToDecimalImpl<FromDataType, ToDataType, bool>(value, 
scale, result); +} + +template <typename T> +inline DataTypePtr createDecimalMaxPrecision(UInt64 scale) +{ + return std::make_shared<DataTypeDecimal<T>>(DecimalUtils::max_precision<T>, scale); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypesNumber.cpp b/contrib/clickhouse/src/DataTypes/DataTypesNumber.cpp new file mode 100644 index 00000000000..232a5101cbe --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypesNumber.cpp @@ -0,0 +1,98 @@ +#include <DataTypes/DataTypesNumber.h> +#include <DataTypes/DataTypeFactory.h> + + +#include <Parsers/IAST.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +template <typename T> +static DataTypePtr createNumericDataType(const ASTPtr & arguments) +{ + if (arguments) + { + if (std::is_integral_v<T>) + { + if (arguments->children.size() > 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "{} data type family must not have more than one argument - display width", TypeName<T>); + } + else + { + if (arguments->children.size() > 2) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "{} data type family must not have more than two arguments - total number " + "of digits and number of digits following the decimal point", TypeName<T>); + } + } + return std::make_shared<DataTypeNumber<T>>(); +} + + +void registerDataTypeNumbers(DataTypeFactory & factory) +{ + factory.registerDataType("UInt8", createNumericDataType<UInt8>); + factory.registerDataType("UInt16", createNumericDataType<UInt16>); + factory.registerDataType("UInt32", createNumericDataType<UInt32>); + factory.registerDataType("UInt64", createNumericDataType<UInt64>); + + factory.registerDataType("Int8", createNumericDataType<Int8>); + factory.registerDataType("Int16", createNumericDataType<Int16>); + factory.registerDataType("Int32", createNumericDataType<Int32>); + factory.registerDataType("Int64", createNumericDataType<Int64>); + + 
factory.registerDataType("Float32", createNumericDataType<Float32>); + factory.registerDataType("Float64", createNumericDataType<Float64>); + + factory.registerSimpleDataType("UInt128", [] { return DataTypePtr(std::make_shared<DataTypeUInt128>()); }); + factory.registerSimpleDataType("UInt256", [] { return DataTypePtr(std::make_shared<DataTypeUInt256>()); }); + + factory.registerSimpleDataType("Int128", [] { return DataTypePtr(std::make_shared<DataTypeInt128>()); }); + factory.registerSimpleDataType("Int256", [] { return DataTypePtr(std::make_shared<DataTypeInt256>()); }); + + /// These synonyms are added for compatibility. + + factory.registerAlias("TINYINT", "Int8", DataTypeFactory::CaseInsensitive); + factory.registerAlias("INT1", "Int8", DataTypeFactory::CaseInsensitive); /// MySQL + factory.registerAlias("BYTE", "Int8", DataTypeFactory::CaseInsensitive); /// MS Access + factory.registerAlias("SMALLINT", "Int16", DataTypeFactory::CaseInsensitive); + factory.registerAlias("INT", "Int32", DataTypeFactory::CaseInsensitive); + factory.registerAlias("INTEGER", "Int32", DataTypeFactory::CaseInsensitive); + factory.registerAlias("BIGINT", "Int64", DataTypeFactory::CaseInsensitive); + factory.registerAlias("FLOAT", "Float32", DataTypeFactory::CaseInsensitive); + factory.registerAlias("REAL", "Float32", DataTypeFactory::CaseInsensitive); + factory.registerAlias("SINGLE", "Float32", DataTypeFactory::CaseInsensitive); /// MS Access + factory.registerAlias("DOUBLE", "Float64", DataTypeFactory::CaseInsensitive); + factory.registerAlias("MEDIUMINT", "Int32", DataTypeFactory::CaseInsensitive); /// MySQL + + factory.registerAlias("DOUBLE PRECISION", "Float64", DataTypeFactory::CaseInsensitive); + + /// MySQL + factory.registerAlias("TINYINT SIGNED", "Int8", DataTypeFactory::CaseInsensitive); + factory.registerAlias("INT1 SIGNED", "Int8", DataTypeFactory::CaseInsensitive); + factory.registerAlias("SMALLINT SIGNED", "Int16", DataTypeFactory::CaseInsensitive); + 
factory.registerAlias("MEDIUMINT SIGNED", "Int32", DataTypeFactory::CaseInsensitive); + factory.registerAlias("INT SIGNED", "Int32", DataTypeFactory::CaseInsensitive); + factory.registerAlias("INTEGER SIGNED", "Int32", DataTypeFactory::CaseInsensitive); + factory.registerAlias("BIGINT SIGNED", "Int64", DataTypeFactory::CaseInsensitive); + factory.registerAlias("TINYINT UNSIGNED", "UInt8", DataTypeFactory::CaseInsensitive); + factory.registerAlias("INT1 UNSIGNED", "UInt8", DataTypeFactory::CaseInsensitive); + factory.registerAlias("SMALLINT UNSIGNED", "UInt16", DataTypeFactory::CaseInsensitive); + factory.registerAlias("MEDIUMINT UNSIGNED", "UInt32", DataTypeFactory::CaseInsensitive); + factory.registerAlias("INT UNSIGNED", "UInt32", DataTypeFactory::CaseInsensitive); + factory.registerAlias("INTEGER UNSIGNED", "UInt32", DataTypeFactory::CaseInsensitive); + factory.registerAlias("BIGINT UNSIGNED", "UInt64", DataTypeFactory::CaseInsensitive); + factory.registerAlias("BIT", "UInt64", DataTypeFactory::CaseInsensitive); /// MySQL + factory.registerAlias("SET", "UInt64", DataTypeFactory::CaseInsensitive); /// MySQL + factory.registerAlias("YEAR", "UInt16", DataTypeFactory::CaseInsensitive); + factory.registerAlias("TIME", "Int64", DataTypeFactory::CaseInsensitive); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/DataTypesNumber.h b/contrib/clickhouse/src/DataTypes/DataTypesNumber.h new file mode 100644 index 00000000000..5843086248c --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/DataTypesNumber.h @@ -0,0 +1,53 @@ +#pragma once + +#include <type_traits> +#include <Core/Field.h> +#include <DataTypes/DataTypeNumberBase.h> +#include <DataTypes/Serializations/SerializationNumber.h> + + +namespace DB +{ + +template <typename T> +class DataTypeNumber final : public DataTypeNumberBase<T> +{ +public: + bool equals(const IDataType & rhs) const override { return typeid(rhs) == typeid(*this); } + + bool canBeUsedAsVersion() const override { return true; } + bool 
isSummable() const override { return true; } + bool canBeUsedInBitOperations() const override { return true; } + bool canBeUsedInBooleanContext() const override { return true; } + bool canBeInsideNullable() const override { return true; } + + bool canBePromoted() const override { return true; } + DataTypePtr promoteNumericType() const override + { + using PromotedType = DataTypeNumber<NearestFieldType<T>>; + return std::make_shared<PromotedType>(); + } + + SerializationPtr doGetDefaultSerialization() const override + { + return std::make_shared<SerializationNumber<T>>(); + } +}; + +using DataTypeUInt8 = DataTypeNumber<UInt8>; +using DataTypeUInt16 = DataTypeNumber<UInt16>; +using DataTypeUInt32 = DataTypeNumber<UInt32>; +using DataTypeUInt64 = DataTypeNumber<UInt64>; +using DataTypeInt8 = DataTypeNumber<Int8>; +using DataTypeInt16 = DataTypeNumber<Int16>; +using DataTypeInt32 = DataTypeNumber<Int32>; +using DataTypeInt64 = DataTypeNumber<Int64>; +using DataTypeFloat32 = DataTypeNumber<Float32>; +using DataTypeFloat64 = DataTypeNumber<Float64>; + +using DataTypeUInt128 = DataTypeNumber<UInt128>; +using DataTypeInt128 = DataTypeNumber<Int128>; +using DataTypeUInt256 = DataTypeNumber<UInt256>; +using DataTypeInt256 = DataTypeNumber<Int256>; + +} diff --git a/contrib/clickhouse/src/DataTypes/EnumValues.cpp b/contrib/clickhouse/src/DataTypes/EnumValues.cpp new file mode 100644 index 00000000000..9df49e765a7 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/EnumValues.cpp @@ -0,0 +1,107 @@ +#include <DataTypes/EnumValues.h> +#include <boost/algorithm/string.hpp> +#include <base/sort.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int SYNTAX_ERROR; + extern const int EMPTY_DATA_PASSED; + extern const int UNKNOWN_ELEMENT_OF_ENUM; +} + +template <typename T> +EnumValues<T>::EnumValues(const Values & values_) + : values(values_) +{ + if (values.empty()) + throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "DataTypeEnum enumeration cannot be empty"); + + 
::sort(std::begin(values), std::end(values), [] (auto & left, auto & right) + { + return left.second < right.second; + }); + + fillMaps(); +} + +template <typename T> +void EnumValues<T>::fillMaps() +{ + for (const auto & name_and_value : values) + { + const auto inserted_value = name_to_value_map.insert( + { StringRef{name_and_value.first}, name_and_value.second }); + + if (!inserted_value.second) + throw Exception(ErrorCodes::SYNTAX_ERROR, "Duplicate names in enum: '{}' = {} and {}", + name_and_value.first, toString(name_and_value.second), toString(inserted_value.first->getMapped())); + + const auto inserted_name = value_to_name_map.insert( + { name_and_value.second, StringRef{name_and_value.first} }); + + if (!inserted_name.second) + throw Exception(ErrorCodes::SYNTAX_ERROR, "Duplicate values in enum: '{}' = {} and '{}'", + name_and_value.first, toString(name_and_value.second), toString((*inserted_name.first).first)); + } +} + +template <typename T> +T EnumValues<T>::getValue(StringRef field_name, bool try_treat_as_id) const +{ + const auto it = name_to_value_map.find(field_name); + if (!it) + { + /// It is used in CSV and TSV input formats. If we fail to find given string in + /// enum names, we will try to treat it as enum id. + if (try_treat_as_id) + { + T x; + ReadBufferFromMemory tmp_buf(field_name.data, field_name.size); + readText(x, tmp_buf); + /// Check if we reached end of the tmp_buf (otherwise field_name is not a number) + /// and try to find it in enum ids + if (tmp_buf.eof() && value_to_name_map.find(x) != value_to_name_map.end()) + return x; + } + auto hints = this->getHints(field_name.toString()); + auto hints_string = !hints.empty() ? 
", maybe you meant: " + toString(hints) : ""; + throw Exception(ErrorCodes::UNKNOWN_ELEMENT_OF_ENUM, "Unknown element '{}' for enum{}", field_name.toString(), hints_string); + } + return it->getMapped(); +} + +template <typename T> +Names EnumValues<T>::getAllRegisteredNames() const +{ + Names result; + for (const auto & value : values) + result.emplace_back(value.first); + return result; +} + +template <typename T> +std::unordered_set<String> EnumValues<T>::getSetOfAllNames(bool to_lower) const +{ + std::unordered_set<String> result; + for (const auto & value : values) + result.insert(to_lower ? boost::algorithm::to_lower_copy(value.first) : value.first); + return result; +} + +template <typename T> +std::unordered_set<T> EnumValues<T>::getSetOfAllValues() const +{ + std::unordered_set<T> result; + for (const auto & value : values) + result.insert(value.second); + return result; +} + +template class EnumValues<Int8>; +template class EnumValues<Int16>; + +} diff --git a/contrib/clickhouse/src/DataTypes/EnumValues.h b/contrib/clickhouse/src/DataTypes/EnumValues.h new file mode 100644 index 00000000000..2e6628adcf3 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/EnumValues.h @@ -0,0 +1,90 @@ +#pragma once + +#include <unordered_map> +#include <Common/HashTable/HashMap.h> +#include <Common/NamePrompter.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +template <typename T> +class EnumValues : public IHints<1, EnumValues<T>> +{ +public: + using Value = std::pair<std::string, T>; + using Values = std::vector<Value>; + using NameToValueMap = HashMap<StringRef, T, StringRefHash>; + using ValueToNameMap = std::unordered_map<T, StringRef>; + +private: + Values values; + NameToValueMap name_to_value_map; + ValueToNameMap value_to_name_map; + + void fillMaps(); + +public: + explicit EnumValues(const Values & values_); + + const Values & getValues() const { return values; } + + auto findByValue(const T & value) const + { + const auto 
it = value_to_name_map.find(value); + if (it == std::end(value_to_name_map)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected value {} in enum", toString(value)); + + return it; + } + + /// throws exception if value is not valid + const StringRef & getNameForValue(const T & value) const + { + return findByValue(value)->second; + } + + /// returns false if value is not valid + bool getNameForValue(const T & value, StringRef & result) const + { + const auto it = value_to_name_map.find(value); + if (it == std::end(value_to_name_map)) + return false; + + result = it->second; + return true; + } + + T getValue(StringRef field_name, bool try_treat_as_id = false) const; + + template <typename TValues> + bool containsAll(const TValues & rhs_values) const + { + auto check = [&](const auto & value) + { + auto it = name_to_value_map.find(value.first); + /// If we don't have this name, then we have to be sure, + /// that this value exists in enum + if (it == name_to_value_map.end()) + return value_to_name_map.count(value.second) > 0; + + /// If we have this name, then it should have the same value + return it->value.second == value.second; + }; + + return std::all_of(rhs_values.begin(), rhs_values.end(), check); + } + + Names getAllRegisteredNames() const override; + + std::unordered_set<String> getSetOfAllNames(bool to_lower) const; + + std::unordered_set<T> getSetOfAllValues() const; +}; + +} + diff --git a/contrib/clickhouse/src/DataTypes/FieldToDataType.cpp b/contrib/clickhouse/src/DataTypes/FieldToDataType.cpp new file mode 100644 index 00000000000..210dab9921e --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/FieldToDataType.cpp @@ -0,0 +1,211 @@ +#include <DataTypes/FieldToDataType.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/DataTypeMap.h> +#include <DataTypes/DataTypeObject.h> +#include <DataTypes/DataTypesNumber.h> +#include <DataTypes/DataTypesDecimal.h> +#include <DataTypes/DataTypeString.h> +#include <DataTypes/DataTypeArray.h> +#include
<DataTypes/DataTypeNullable.h> +#include <DataTypes/DataTypeNothing.h> +#include <DataTypes/DataTypeUUID.h> +#include <DataTypes/DataTypeIPv4andIPv6.h> +#include <DataTypes/getLeastSupertype.h> +#include <DataTypes/DataTypeFactory.h> +#include <Common/Exception.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int EMPTY_DATA_PASSED; + extern const int NOT_IMPLEMENTED; +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const Null &) const +{ + return std::make_shared<DataTypeNullable>(std::make_shared<DataTypeNothing>()); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const UInt64 & x) const +{ + if (x <= std::numeric_limits<UInt8>::max()) return std::make_shared<DataTypeUInt8>(); + if (x <= std::numeric_limits<UInt16>::max()) return std::make_shared<DataTypeUInt16>(); + if (x <= std::numeric_limits<UInt32>::max()) return std::make_shared<DataTypeUInt32>(); + return std::make_shared<DataTypeUInt64>(); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const Int64 & x) const +{ + if (x <= std::numeric_limits<Int8>::max() && x >= std::numeric_limits<Int8>::min()) return std::make_shared<DataTypeInt8>(); + if (x <= std::numeric_limits<Int16>::max() && x >= std::numeric_limits<Int16>::min()) return std::make_shared<DataTypeInt16>(); + if (x <= std::numeric_limits<Int32>::max() && x >= std::numeric_limits<Int32>::min()) return std::make_shared<DataTypeInt32>(); + return std::make_shared<DataTypeInt64>(); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const Float64 &) const +{ + return std::make_shared<DataTypeFloat64>(); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const UInt128 &) const +{ + return std::make_shared<DataTypeUInt128>(); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr 
FieldToDataType<on_error>::operator() (const Int128 &) const +{ + return std::make_shared<DataTypeInt128>(); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const UInt256 &) const +{ + return std::make_shared<DataTypeUInt256>(); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const Int256 &) const +{ + return std::make_shared<DataTypeInt256>(); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const UUID &) const +{ + return std::make_shared<DataTypeUUID>(); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const IPv4 &) const +{ + return std::make_shared<DataTypeIPv4>(); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const IPv6 &) const +{ + return std::make_shared<DataTypeIPv6>(); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const String &) const +{ + return std::make_shared<DataTypeString>(); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const DecimalField<Decimal32> & x) const +{ + using Type = DataTypeDecimal<Decimal32>; + return std::make_shared<Type>(Type::maxPrecision(), x.getScale()); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const DecimalField<Decimal64> & x) const +{ + using Type = DataTypeDecimal<Decimal64>; + return std::make_shared<Type>(Type::maxPrecision(), x.getScale()); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const DecimalField<Decimal128> & x) const +{ + using Type = DataTypeDecimal<Decimal128>; + return std::make_shared<Type>(Type::maxPrecision(), x.getScale()); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const 
DecimalField<Decimal256> & x) const +{ + using Type = DataTypeDecimal<Decimal256>; + return std::make_shared<Type>(Type::maxPrecision(), x.getScale()); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const Array & x) const +{ + DataTypes element_types; + element_types.reserve(x.size()); + + for (const Field & elem : x) + element_types.emplace_back(applyVisitor(*this, elem)); + + return std::make_shared<DataTypeArray>(getLeastSupertype<on_error>(element_types)); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const Tuple & tuple) const +{ + if (tuple.empty()) + throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Cannot infer type of an empty tuple"); + + DataTypes element_types; + element_types.reserve(tuple.size()); + + for (const auto & element : tuple) + element_types.push_back(applyVisitor(*this, element)); + + return std::make_shared<DataTypeTuple>(element_types); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const Map & map) const +{ + DataTypes key_types; + DataTypes value_types; + key_types.reserve(map.size()); + value_types.reserve(map.size()); + + for (const auto & elem : map) + { + const auto & tuple = elem.safeGet<const Tuple &>(); + assert(tuple.size() == 2); + key_types.push_back(applyVisitor(*this, tuple[0])); + value_types.push_back(applyVisitor(*this, tuple[1])); + } + + return std::make_shared<DataTypeMap>( + getLeastSupertype<on_error>(key_types), + getLeastSupertype<on_error>(value_types)); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const Object &) const +{ + /// TODO: Do we need different parameters for type Object? 
+ return std::make_shared<DataTypeObject>("json", false); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const AggregateFunctionStateData & x) const +{ + const auto & name = static_cast<const AggregateFunctionStateData &>(x).name; + return DataTypeFactory::instance().get(name); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator() (const CustomType &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented"); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr FieldToDataType<on_error>::operator()(const bool &) const +{ + return DataTypeFactory::instance().get("Bool"); +} + +template class FieldToDataType<LeastSupertypeOnError::Throw>; +template class FieldToDataType<LeastSupertypeOnError::String>; +template class FieldToDataType<LeastSupertypeOnError::Null>; + +} diff --git a/contrib/clickhouse/src/DataTypes/FieldToDataType.h b/contrib/clickhouse/src/DataTypes/FieldToDataType.h new file mode 100644 index 00000000000..8febadc1a0d --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/FieldToDataType.h @@ -0,0 +1,50 @@ +#pragma once + +#include <memory> +#include <Core/Types.h> +#include <Core/Field.h> +#include <Common/FieldVisitors.h> +#include <DataTypes/getLeastSupertype.h> + + +namespace DB +{ + +class IDataType; +using DataTypePtr = std::shared_ptr<const IDataType>; + + +/** For a given Field returns the minimum data type that allows this value to be stored. + * Note that you still have to convert Field to corresponding data type before inserting to columns + * (for example, this is necessary to convert elements of Array to common type). 
+ */ +template <LeastSupertypeOnError on_error = LeastSupertypeOnError::Throw> +class FieldToDataType : public StaticVisitor<DataTypePtr> +{ +public: + DataTypePtr operator() (const Null & x) const; + DataTypePtr operator() (const UInt64 & x) const; + DataTypePtr operator() (const UInt128 & x) const; + DataTypePtr operator() (const Int64 & x) const; + DataTypePtr operator() (const Int128 & x) const; + DataTypePtr operator() (const UUID & x) const; + DataTypePtr operator() (const IPv4 & x) const; + DataTypePtr operator() (const IPv6 & x) const; + DataTypePtr operator() (const Float64 & x) const; + DataTypePtr operator() (const String & x) const; + DataTypePtr operator() (const Array & x) const; + DataTypePtr operator() (const Tuple & tuple) const; + DataTypePtr operator() (const Map & map) const; + DataTypePtr operator() (const Object & map) const; + DataTypePtr operator() (const DecimalField<Decimal32> & x) const; + DataTypePtr operator() (const DecimalField<Decimal64> & x) const; + DataTypePtr operator() (const DecimalField<Decimal128> & x) const; + DataTypePtr operator() (const DecimalField<Decimal256> & x) const; + DataTypePtr operator() (const AggregateFunctionStateData & x) const; + DataTypePtr operator() (const CustomType & x) const; + DataTypePtr operator() (const UInt256 & x) const; + DataTypePtr operator() (const Int256 & x) const; + DataTypePtr operator() (const bool & x) const; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/IDataType.cpp b/contrib/clickhouse/src/DataTypes/IDataType.cpp new file mode 100644 index 00000000000..4ffe82039b2 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/IDataType.cpp @@ -0,0 +1,254 @@ +#include <cstddef> +#include <Columns/IColumn.h> +#include <Columns/ColumnConst.h> +#include <Columns/ColumnSparse.h> + +#include <Common/Exception.h> +#include <Common/SipHash.h> + +#include <IO/WriteHelpers.h> +#include <IO/Operators.h> + +#include <DataTypes/IDataType.h> +#include <DataTypes/DataTypeCustom.h> +#include 
<DataTypes/NestedUtils.h> +#include <DataTypes/Serializations/SerializationSparse.h> +#include <DataTypes/Serializations/SerializationInfo.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int DATA_TYPE_CANNOT_BE_PROMOTED; + extern const int ILLEGAL_COLUMN; +} + +IDataType::~IDataType() = default; + +void IDataType::updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint) +{ + /// Update the average value size hint if amount of read rows isn't too small + size_t column_size = column.size(); + if (column_size > 10) + { + double current_avg_value_size = static_cast<double>(column.byteSize()) / column_size; + + /// Heuristic is chosen so that avg_value_size_hint increases rapidly but decreases slowly. + if (current_avg_value_size > avg_value_size_hint) + avg_value_size_hint = std::min(1024., current_avg_value_size); /// avoid overestimation + else if (current_avg_value_size * 2 < avg_value_size_hint) + avg_value_size_hint = (current_avg_value_size + avg_value_size_hint * 3) / 4; + } +} + +MutableColumnPtr IDataType::createColumn(const ISerialization & serialization) const +{ + auto column = createColumn(); + if (serialization.getKind() == ISerialization::Kind::SPARSE) + return ColumnSparse::create(std::move(column)); + + return column; +} + +ColumnPtr IDataType::createColumnConst(size_t size, const Field & field) const +{ + auto column = createColumn(); + column->insert(field); + return ColumnConst::create(std::move(column), size); +} + + +ColumnPtr IDataType::createColumnConstWithDefaultValue(size_t size) const +{ + return createColumnConst(size, getDefault()); +} + +DataTypePtr IDataType::promoteNumericType() const +{ + throw Exception(ErrorCodes::DATA_TYPE_CANNOT_BE_PROMOTED, "Data type {} can't be promoted.", getName()); +} + +size_t IDataType::getSizeOfValueInMemory() const +{ + throw Exception(ErrorCodes::LOGICAL_ERROR, "Value of type {} in memory is not of fixed size.", getName()); +} + +void 
IDataType::forEachSubcolumn( + const SubcolumnCallback & callback, + const SubstreamData & data) +{ + ISerialization::StreamCallback callback_with_data = [&](const auto & subpath) + { + for (size_t i = 0; i < subpath.size(); ++i) + { + size_t prefix_len = i + 1; + if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, prefix_len)) + { + auto name = ISerialization::getSubcolumnNameForStream(subpath, prefix_len); + auto subdata = ISerialization::createFromPath(subpath, prefix_len); + callback(subpath, name, subdata); + } + subpath[i].visited = true; + } + }; + + ISerialization::EnumerateStreamsSettings settings; + settings.position_independent_encoding = false; + data.serialization->enumerateStreams(settings, callback_with_data, data); +} + +template <typename Ptr> +Ptr IDataType::getForSubcolumn( + std::string_view subcolumn_name, + const SubstreamData & data, + Ptr SubstreamData::*member, + bool throw_if_null) const +{ + Ptr res; + forEachSubcolumn([&](const auto &, const auto & name, const auto & subdata) + { + if (name == subcolumn_name) + res = subdata.*member; + }, data); + + if (!res && throw_if_null) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); + + return res; +} + +bool IDataType::hasSubcolumn(std::string_view subcolumn_name) const +{ + return tryGetSubcolumnType(subcolumn_name) != nullptr; +} + +DataTypePtr IDataType::tryGetSubcolumnType(std::string_view subcolumn_name) const +{ + auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()); + return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type, false); +} + +DataTypePtr IDataType::getSubcolumnType(std::string_view subcolumn_name) const +{ + auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()); + return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type, true); +} + +ColumnPtr IDataType::tryGetSubcolumn(std::string_view subcolumn_name, const 
ColumnPtr & column) const +{ + auto data = SubstreamData(getDefaultSerialization()).withColumn(column); + return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column, false); +} + +ColumnPtr IDataType::getSubcolumn(std::string_view subcolumn_name, const ColumnPtr & column) const +{ + auto data = SubstreamData(getDefaultSerialization()).withColumn(column); + return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column, true); +} + +SerializationPtr IDataType::getSubcolumnSerialization(std::string_view subcolumn_name, const SerializationPtr & serialization) const +{ + auto data = SubstreamData(serialization); + return getForSubcolumn<SerializationPtr>(subcolumn_name, data, &SubstreamData::serialization, true); +} + +Names IDataType::getSubcolumnNames() const +{ + Names res; + forEachSubcolumn([&](const auto &, const auto & name, const auto &) + { + res.push_back(name); + }, SubstreamData(getDefaultSerialization())); + return res; +} + +void IDataType::insertDefaultInto(IColumn & column) const +{ + column.insertDefault(); +} + +void IDataType::insertManyDefaultsInto(IColumn & column, size_t n) const +{ + for (size_t i = 0; i < n; ++i) + insertDefaultInto(column); +} + +void IDataType::setCustomization(DataTypeCustomDescPtr custom_desc_) const +{ + /// replace only if not null + if (custom_desc_->name) + custom_name = std::move(custom_desc_->name); + + if (custom_desc_->serialization) + custom_serialization = std::move(custom_desc_->serialization); +} + +MutableSerializationInfoPtr IDataType::createSerializationInfo(const SerializationInfo::Settings & settings) const +{ + return std::make_shared<SerializationInfo>(ISerialization::Kind::DEFAULT, settings); +} + +SerializationInfoPtr IDataType::getSerializationInfo(const IColumn & column) const +{ + if (const auto * column_const = checkAndGetColumn<ColumnConst>(&column)) + return getSerializationInfo(column_const->getDataColumn()); + + return 
std::make_shared<SerializationInfo>(ISerialization::getKind(column), SerializationInfo::Settings{}); +} + +SerializationPtr IDataType::getDefaultSerialization() const +{ + if (custom_serialization) + return custom_serialization; + + return doGetDefaultSerialization(); +} + +SerializationPtr IDataType::getSparseSerialization() const +{ + return std::make_shared<SerializationSparse>(getDefaultSerialization()); +} + +SerializationPtr IDataType::getSerialization(ISerialization::Kind kind) const +{ + if (supportsSparseSerialization() && kind == ISerialization::Kind::SPARSE) + return getSparseSerialization(); + + return getDefaultSerialization(); +} + +SerializationPtr IDataType::getSerialization(const SerializationInfo & info) const +{ + return getSerialization(info.getKind()); +} + +// static +SerializationPtr IDataType::getSerialization(const NameAndTypePair & column, const SerializationInfo & info) +{ + if (column.isSubcolumn()) + { + const auto & type_in_storage = column.getTypeInStorage(); + auto serialization = type_in_storage->getSerialization(info); + return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), serialization); + } + + return column.type->getSerialization(info); +} + +// static +SerializationPtr IDataType::getSerialization(const NameAndTypePair & column) +{ + if (column.isSubcolumn()) + { + const auto & type_in_storage = column.getTypeInStorage(); + auto serialization = type_in_storage->getDefaultSerialization(); + return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), serialization); + } + + return column.type->getDefaultSerialization(); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/IDataType.h b/contrib/clickhouse/src/DataTypes/IDataType.h new file mode 100644 index 00000000000..54cb3d0d5c2 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/IDataType.h @@ -0,0 +1,649 @@ +#pragma once + +#include <memory> +#include <boost/noncopyable.hpp> +#include <Core/Names.h> +#include <Core/TypeId.h> +#include 
<Common/COW.h> +#include <DataTypes/DataTypeCustom.h> +#include <DataTypes/Serializations/ISerialization.h> +#include <DataTypes/Serializations/SerializationInfo.h> + + +namespace DB +{ + +class ReadBuffer; +class WriteBuffer; + +class IDataType; +struct FormatSettings; + +class IColumn; +using ColumnPtr = COW<IColumn>::Ptr; +using MutableColumnPtr = COW<IColumn>::MutablePtr; + +class Field; + +using DataTypePtr = std::shared_ptr<const IDataType>; +using DataTypes = std::vector<DataTypePtr>; + +struct NameAndTypePair; + +struct DataTypeWithConstInfo +{ + DataTypePtr type; + bool is_const; +}; + +using DataTypesWithConstInfo = std::vector<DataTypeWithConstInfo>; + +/** Properties of data type. + * + * Contains methods for getting serialization instances. + * One data type may have different serializations, which can be chosen + * dynamically before reading or writing, according to information about + * column content (see `getSerialization` methods). + * + * Implementations of this interface represent a data type (example: UInt8) + * or parametric family of data types (example: Array(...)). + * + * DataType is totally immutable object. You can always share them. + */ +class IDataType : private boost::noncopyable, public std::enable_shared_from_this<IDataType> +{ +public: + IDataType() = default; + virtual ~IDataType(); + + /// Compile time flag. If false, then if C++ types are the same, then SQL types are also the same. + /// Example: DataTypeString is not parametric: thus all instances of DataTypeString are the same SQL type. + /// Example: DataTypeFixedString is parametric: different instances of DataTypeFixedString may be different SQL types. + /// Place it in descendants: + /// static constexpr bool is_parametric = false; + + /// Name of data type (examples: UInt64, Array(String)). 
+ String getName() const + { + if (custom_name) + return custom_name->getName(); + else + return doGetName(); + } + + DataTypePtr getPtr() const { return shared_from_this(); } + + /// Name of data type family (example: FixedString, Array). + virtual const char * getFamilyName() const = 0; + /// Name of the corresponding data type in MySQL (example: Bigint, Blob, etc.) + virtual String getSQLCompatibleName() const = 0; + + /// Data type id. It's used for runtime type checks. + virtual TypeIndex getTypeId() const = 0; + + bool hasSubcolumn(std::string_view subcolumn_name) const; + + DataTypePtr tryGetSubcolumnType(std::string_view subcolumn_name) const; + DataTypePtr getSubcolumnType(std::string_view subcolumn_name) const; + + ColumnPtr tryGetSubcolumn(std::string_view subcolumn_name, const ColumnPtr & column) const; + ColumnPtr getSubcolumn(std::string_view subcolumn_name, const ColumnPtr & column) const; + + SerializationPtr getSubcolumnSerialization(std::string_view subcolumn_name, const SerializationPtr & serialization) const; + + using SubstreamData = ISerialization::SubstreamData; + using SubstreamPath = ISerialization::SubstreamPath; + + using SubcolumnCallback = std::function<void( + const SubstreamPath &, + const String &, + const SubstreamData &)>; + + static void forEachSubcolumn( + const SubcolumnCallback & callback, + const SubstreamData & data); + + Names getSubcolumnNames() const; + + virtual MutableSerializationInfoPtr createSerializationInfo(const SerializationInfo::Settings & settings) const; + virtual SerializationInfoPtr getSerializationInfo(const IColumn & column) const; + + /// TODO: support more types. + virtual bool supportsSparseSerialization() const { return !haveSubtypes(); } + virtual bool canBeInsideSparseColumns() const { return supportsSparseSerialization(); } + + SerializationPtr getDefaultSerialization() const; + SerializationPtr getSparseSerialization() const; + + /// Chooses serialization according to serialization kind. 
+ SerializationPtr getSerialization(ISerialization::Kind kind) const; + + /// Chooses serialization according to collected information about content of column. + virtual SerializationPtr getSerialization(const SerializationInfo & info) const; + + /// Chooses between subcolumn serialization and regular serialization according to @column. + /// This method typically should be used to get serialization for reading column or subcolumn. + static SerializationPtr getSerialization(const NameAndTypePair & column, const SerializationInfo & info); + + static SerializationPtr getSerialization(const NameAndTypePair & column); + +protected: + virtual String doGetName() const { return getFamilyName(); } + virtual SerializationPtr doGetDefaultSerialization() const = 0; + +public: + /** Create empty column for corresponding type and default serialization. + */ + virtual MutableColumnPtr createColumn() const = 0; + + /** Create empty column for corresponding type and serialization. + */ + virtual MutableColumnPtr createColumn(const ISerialization & serialization) const; + + /** Create ColumnConst for corresponding type, with specified size and value. + */ + ColumnPtr createColumnConst(size_t size, const Field & field) const; + ColumnPtr createColumnConstWithDefaultValue(size_t size) const; + + /** Get default value of data type. + * It is the "default" default, regardless the fact that a table could contain different user-specified default. + */ + virtual Field getDefault() const = 0; + + /** The data type can be promoted in order to try to avoid overflows. + * Data types which can be promoted are typically Number or Decimal data types. + */ + virtual bool canBePromoted() const { return false; } + + /** Return the promoted numeric data type of the current data type. Throw an exception if `canBePromoted() == false`. + */ + virtual DataTypePtr promoteNumericType() const; + + /** Directly insert default value into a column. Default implementation use method IColumn::insertDefault. 
+ * This should be overridden if data type default value differs from column default value (example: Enum data types). + */ + virtual void insertDefaultInto(IColumn & column) const; + + void insertManyDefaultsInto(IColumn & column, size_t n) const; + + /// Checks that two instances belong to the same type + virtual bool equals(const IDataType & rhs) const = 0; + + /// Various properties on behaviour of data type. + + /** The data type is dependent on parameters and types with different parameters are different. + * Examples: FixedString(N), Tuple(T1, T2), Nullable(T). + * Otherwise all instances of the same class are the same types. + */ + virtual bool isParametric() const = 0; + + /** The data type is dependent on parameters and at least one of them is another type. + * Examples: Tuple(T1, T2), Nullable(T). But FixedString(N) is not. + */ + virtual bool haveSubtypes() const = 0; + + /** Can appear in table definition. + * Counterexamples: Interval, Nothing. + */ + virtual bool cannotBeStoredInTables() const { return false; } + + /** In text formats that render "pretty" tables, + * is it better to align value right in table cell. + * Examples: numbers, even nullable. + */ + virtual bool shouldAlignRightInPrettyFormats() const { return false; } + + /** Does formatted value in any text format can contain anything but valid UTF8 sequences. + * Example: String (because it can contain arbitrary bytes). + * Counterexamples: numbers, Date, DateTime. + * For Enum, it depends. + */ + virtual bool textCanContainOnlyValidUTF8() const { return false; } + + /** Is it possible to compare for less/greater, to calculate min/max? + * Not necessarily totally comparable. For example, floats are comparable despite the fact that NaNs compares to nothing. + * The same for nullable of comparable types: they are comparable (but not totally-comparable). + */ + virtual bool isComparable() const { return false; } + + /** Does it make sense to use this type with COLLATE modifier in ORDER BY. 
+ * Example: String, but not FixedString. + */ + virtual bool canBeComparedWithCollation() const { return false; } + + /** If the type is totally comparable (Ints, Date, DateTime, DateTime64, not nullable, not floats) + * and "simple" enough (not String, FixedString) to be used as version number + * (to select rows with maximum version). + */ + virtual bool canBeUsedAsVersion() const { return false; } + + /** Values of data type can be summed (possibly with overflow, within the same data type). + * Example: numbers, even nullable. Not Date/DateTime. Not Enum. + * Enums can be passed to aggregate function 'sum', but the result is Int64, not Enum, so they are not summable. + */ + virtual bool isSummable() const { return false; } + + /** Can be used in operations like bit and, bit shift, bit not, etc. + */ + virtual bool canBeUsedInBitOperations() const { return false; } + + /** Can be used in boolean context (WHERE, HAVING). + * UInt8, maybe nullable. + */ + virtual bool canBeUsedInBooleanContext() const { return false; } + + /** Numbers, Enums, Date, DateTime. Not nullable. + */ + virtual bool isValueRepresentedByNumber() const { return false; } + + /** Integers, Enums, Date, DateTime. Not nullable. + */ + virtual bool isValueRepresentedByInteger() const { return false; } + + /** Unsigned Integers, Date, DateTime. Not nullable. + */ + virtual bool isValueRepresentedByUnsignedInteger() const { return false; } + + /** Values are unambiguously identified by contents of contiguous memory region, + * that can be obtained by IColumn::getDataAt method. + * Examples: numbers, Date, DateTime, String, FixedString, + * and Arrays of numbers, Date, DateTime, FixedString, Enum, but not String. + * (because Array(String) values became ambiguous if you concatenate Strings). + * Counterexamples: Nullable, Tuple. 
+ */ + virtual bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const { return false; } + + virtual bool isValueUnambiguouslyRepresentedInFixedSizeContiguousMemoryRegion() const + { + return isValueRepresentedByNumber(); + } + + /** Example: numbers, Date, DateTime, FixedString, Enum... Nullable and Tuple of such types. + * Counterexamples: String, Array. + * It's Ok to return false for AggregateFunction despite the fact that some of them have fixed size state. + */ + virtual bool haveMaximumSizeOfValue() const { return false; } + + /** Size in amount of bytes in memory. Throws an exception if not haveMaximumSizeOfValue. + */ + virtual size_t getMaximumSizeOfValueInMemory() const { return getSizeOfValueInMemory(); } + + /** Throws an exception if value is not of fixed size. + */ + virtual size_t getSizeOfValueInMemory() const; + + /** Integers (not floats), Enum, String, FixedString. + */ + virtual bool isCategorial() const { return false; } + + virtual bool isNullable() const { return false; } + + /** Can this type represent only the NULL value? (This also implies isNullable.) + */ + virtual bool onlyNull() const { return false; } + + /** If this data type cannot be wrapped in Nullable data type. + */ + virtual bool canBeInsideNullable() const { return false; } + + virtual bool lowCardinality() const { return false; } + + /// Checks if this type is LowCardinality(Nullable(...)) + virtual bool isLowCardinalityNullable() const { return false; } + + /// Strings, Numbers, Date, DateTime, Nullable + virtual bool canBeInsideLowCardinality() const { return false; } + + /// Object, Array(Object), Tuple(..., Object, ...) + virtual bool hasDynamicSubcolumns() const { return false; } + + /// Updates avg_value_size_hint for newly read column. Used to optimize deserialization. Zero expected for first column. 
+ static void updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint); + +protected: + friend class DataTypeFactory; + friend class AggregateFunctionSimpleState; + + /// Customize this DataType + void setCustomization(DataTypeCustomDescPtr custom_desc_) const; + + /// This is mutable to allow setting custom name and serialization on `const IDataType` post construction. + mutable DataTypeCustomNamePtr custom_name; + mutable SerializationPtr custom_serialization; + +public: + const IDataTypeCustomName * getCustomName() const { return custom_name.get(); } + const ISerialization * getCustomSerialization() const { return custom_serialization.get(); } + +private: + template <typename Ptr> + Ptr getForSubcolumn( + std::string_view subcolumn_name, + const SubstreamData & data, + Ptr SubstreamData::*member, + bool throw_if_null) const; +}; + + +/// Some sugar to check data type of IDataType +struct WhichDataType +{ + TypeIndex idx; + + constexpr WhichDataType(TypeIndex idx_ = TypeIndex::Nothing) : idx(idx_) {} /// NOLINT + constexpr WhichDataType(const IDataType & data_type) : idx(data_type.getTypeId()) {} /// NOLINT + constexpr WhichDataType(const IDataType * data_type) : idx(data_type->getTypeId()) {} /// NOLINT + + // shared ptr -> is non-constexpr in gcc + WhichDataType(const DataTypePtr & data_type) : idx(data_type->getTypeId()) {} /// NOLINT + + constexpr bool isUInt8() const { return idx == TypeIndex::UInt8; } + constexpr bool isUInt16() const { return idx == TypeIndex::UInt16; } + constexpr bool isUInt32() const { return idx == TypeIndex::UInt32; } + constexpr bool isUInt64() const { return idx == TypeIndex::UInt64; } + constexpr bool isUInt128() const { return idx == TypeIndex::UInt128; } + constexpr bool isUInt256() const { return idx == TypeIndex::UInt256; } + constexpr bool isNativeUInt() const { return isUInt8() || isUInt16() || isUInt32() || isUInt64(); } + constexpr bool isUInt() const { return isNativeUInt() || isUInt128() || 
isUInt256(); } + + constexpr bool isInt8() const { return idx == TypeIndex::Int8; } + constexpr bool isInt16() const { return idx == TypeIndex::Int16; } + constexpr bool isInt32() const { return idx == TypeIndex::Int32; } + constexpr bool isInt64() const { return idx == TypeIndex::Int64; } + constexpr bool isInt128() const { return idx == TypeIndex::Int128; } + constexpr bool isInt256() const { return idx == TypeIndex::Int256; } + constexpr bool isNativeInt() const { return isInt8() || isInt16() || isInt32() || isInt64(); } + constexpr bool isInt() const { return isNativeInt() || isInt128() || isInt256(); } + + constexpr bool isDecimal32() const { return idx == TypeIndex::Decimal32; } + constexpr bool isDecimal64() const { return idx == TypeIndex::Decimal64; } + constexpr bool isDecimal128() const { return idx == TypeIndex::Decimal128; } + constexpr bool isDecimal256() const { return idx == TypeIndex::Decimal256; } + constexpr bool isDecimal() const { return isDecimal32() || isDecimal64() || isDecimal128() || isDecimal256(); } + + constexpr bool isFloat32() const { return idx == TypeIndex::Float32; } + constexpr bool isFloat64() const { return idx == TypeIndex::Float64; } + constexpr bool isFloat() const { return isFloat32() || isFloat64(); } + + constexpr bool isEnum8() const { return idx == TypeIndex::Enum8; } + constexpr bool isEnum16() const { return idx == TypeIndex::Enum16; } + constexpr bool isEnum() const { return isEnum8() || isEnum16(); } + + constexpr bool isDate() const { return idx == TypeIndex::Date; } + constexpr bool isDate32() const { return idx == TypeIndex::Date32; } + constexpr bool isDateTime() const { return idx == TypeIndex::DateTime; } + constexpr bool isDateTime64() const { return idx == TypeIndex::DateTime64; } + constexpr bool isDateOrDate32() const { return isDate() || isDate32(); } + + constexpr bool isString() const { return idx == TypeIndex::String; } + constexpr bool isFixedString() const { return idx == TypeIndex::FixedString; } + 
constexpr bool isStringOrFixedString() const { return isString() || isFixedString(); } + + constexpr bool isUUID() const { return idx == TypeIndex::UUID; } + constexpr bool isIPv4() const { return idx == TypeIndex::IPv4; } + constexpr bool isIPv6() const { return idx == TypeIndex::IPv6; } + constexpr bool isArray() const { return idx == TypeIndex::Array; } + constexpr bool isTuple() const { return idx == TypeIndex::Tuple; } + constexpr bool isMap() const {return idx == TypeIndex::Map; } + constexpr bool isSet() const { return idx == TypeIndex::Set; } + constexpr bool isInterval() const { return idx == TypeIndex::Interval; } + constexpr bool isObject() const { return idx == TypeIndex::Object; } + + constexpr bool isNothing() const { return idx == TypeIndex::Nothing; } + constexpr bool isNullable() const { return idx == TypeIndex::Nullable; } + constexpr bool isFunction() const { return idx == TypeIndex::Function; } + constexpr bool isAggregateFunction() const { return idx == TypeIndex::AggregateFunction; } + constexpr bool isSimple() const { return isInt() || isUInt() || isFloat() || isString(); } + + constexpr bool isLowCardinality() const { return idx == TypeIndex::LowCardinality; } +}; + +/// IDataType helpers (alternative for IDataType virtual methods with single point of truth) + +template <typename T> +inline bool isDate(const T & data_type) { return WhichDataType(data_type).isDate(); } +template <typename T> +inline bool isDate32(const T & data_type) { return WhichDataType(data_type).isDate32(); } +template <typename T> +inline bool isDateOrDate32(const T & data_type) { return WhichDataType(data_type).isDateOrDate32(); } +template <typename T> +inline bool isDateTime(const T & data_type) { return WhichDataType(data_type).isDateTime(); } +template <typename T> +inline bool isDateTime64(const T & data_type) { return WhichDataType(data_type).isDateTime64(); } + +template <typename T> +inline bool isEnum(const T & data_type) { return 
WhichDataType(data_type).isEnum(); } +template <typename T> +inline bool isDecimal(const T & data_type) { return WhichDataType(data_type).isDecimal(); } +template <typename T> +inline bool isTuple(const T & data_type) { return WhichDataType(data_type).isTuple(); } +template <typename T> +inline bool isArray(const T & data_type) { return WhichDataType(data_type).isArray(); } +template <typename T> +inline bool isMap(const T & data_type) {return WhichDataType(data_type).isMap(); } +template <typename T> +inline bool isInterval(const T & data_type) {return WhichDataType(data_type).isInterval(); } +template <typename T> +inline bool isNothing(const T & data_type) { return WhichDataType(data_type).isNothing(); } +template <typename T> +inline bool isUUID(const T & data_type) { return WhichDataType(data_type).isUUID(); } +template <typename T> +inline bool isIPv4(const T & data_type) { return WhichDataType(data_type).isIPv4(); } +template <typename T> +inline bool isIPv6(const T & data_type) { return WhichDataType(data_type).isIPv6(); } + +template <typename T> +inline bool isObject(const T & data_type) { return WhichDataType(data_type).isObject(); +} + +template <typename T> +inline bool isUInt8(const T & data_type) +{ + return WhichDataType(data_type).isUInt8(); +} + +template <typename T> +inline bool isUInt64(const T & data_type) +{ + return WhichDataType(data_type).isUInt64(); +} + +template <typename T> +inline bool isUnsignedInteger(const T & data_type) +{ + return WhichDataType(data_type).isUInt(); +} + +template <typename T> +inline bool isInteger(const T & data_type) +{ + WhichDataType which(data_type); + return which.isInt() || which.isUInt(); +} + +template <typename T> +inline bool isFloat(const T & data_type) +{ + WhichDataType which(data_type); + return which.isFloat(); +} + +template <typename T> +inline bool isNativeInteger(const T & data_type) +{ + WhichDataType which(data_type); + return which.isNativeInt() || which.isNativeUInt(); +} + + +template 
<typename T> +inline bool isNativeNumber(const T & data_type) +{ + WhichDataType which(data_type); + return which.isNativeInt() || which.isNativeUInt() || which.isFloat(); +} + +template <typename T> +inline bool isNumber(const T & data_type) +{ + WhichDataType which(data_type); + return which.isInt() || which.isUInt() || which.isFloat() || which.isDecimal(); +} + +template <typename T> +inline bool isColumnedAsNumber(const T & data_type) +{ + WhichDataType which(data_type); + return which.isInt() || which.isUInt() || which.isFloat() || which.isDateOrDate32() || which.isDateTime() || which.isDateTime64() || which.isUUID() || which.isIPv4() || which.isIPv6(); +} + +template <typename T> +inline bool isColumnedAsDecimal(const T & data_type) +{ + WhichDataType which(data_type); + return which.isDecimal() || which.isDateTime64(); +} + +// Same as isColumnedAsDecimal, but also checks the value type of the underlying column. +template <typename T, typename DataType> +inline bool isColumnedAsDecimalT(const DataType & data_type) +{ + const WhichDataType which(data_type); + return (which.isDecimal() || which.isDateTime64()) && which.idx == TypeToTypeIndex<T>; +} + +template <typename T> +inline bool isString(const T & data_type) +{ + return WhichDataType(data_type).isString(); +} + +template <typename T> +inline bool isFixedString(const T & data_type) +{ + return WhichDataType(data_type).isFixedString(); +} + +template <typename T> +inline bool isStringOrFixedString(const T & data_type) +{ + return WhichDataType(data_type).isStringOrFixedString(); +} + +template <typename T> +inline bool isNotCreatable(const T & data_type) +{ + WhichDataType which(data_type); + return which.isNothing() || which.isFunction() || which.isSet(); +} + +inline bool isNotDecimalButComparableToDecimal(const DataTypePtr & data_type) +{ + WhichDataType which(data_type); + return which.isInt() || which.isUInt() || which.isFloat(); +} + +inline bool isBool(const DataTypePtr & data_type) +{ + return 
data_type->getName() == "Bool"; +} + +inline bool isAggregateFunction(const DataTypePtr & data_type) +{ + WhichDataType which(data_type); + return which.isAggregateFunction(); +} + +inline bool isNullableOrLowCardinalityNullable(const DataTypePtr & data_type) +{ + return data_type->isNullable() || data_type->isLowCardinalityNullable(); +} + +template <typename DataType> constexpr bool IsDataTypeDecimal = false; +template <typename DataType> constexpr bool IsDataTypeNumber = false; +template <typename DataType> constexpr bool IsDataTypeDateOrDateTime = false; +template <typename DataType> constexpr bool IsDataTypeDate = false; +template <typename DataType> constexpr bool IsDataTypeEnum = false; + +template <typename DataType> constexpr bool IsDataTypeDecimalOrNumber = IsDataTypeDecimal<DataType> || IsDataTypeNumber<DataType>; + +template <is_decimal T> +class DataTypeDecimal; + +template <typename T> +class DataTypeNumber; + +class DataTypeDate; +class DataTypeDate32; +class DataTypeDateTime; +class DataTypeDateTime64; + +template <is_decimal T> constexpr bool IsDataTypeDecimal<DataTypeDecimal<T>> = true; +template <> inline constexpr bool IsDataTypeDecimal<DataTypeDateTime64> = true; + +template <typename T> constexpr bool IsDataTypeNumber<DataTypeNumber<T>> = true; + +template <> inline constexpr bool IsDataTypeDate<DataTypeDate> = true; +template <> inline constexpr bool IsDataTypeDate<DataTypeDate32> = true; + +template <> inline constexpr bool IsDataTypeDateOrDateTime<DataTypeDate> = true; +template <> inline constexpr bool IsDataTypeDateOrDateTime<DataTypeDate32> = true; +template <> inline constexpr bool IsDataTypeDateOrDateTime<DataTypeDateTime> = true; +template <> inline constexpr bool IsDataTypeDateOrDateTime<DataTypeDateTime64> = true; + +template <typename T> +class DataTypeEnum; + +template <typename T> inline constexpr bool IsDataTypeEnum<DataTypeEnum<T>> = true; + +#define FOR_BASIC_NUMERIC_TYPES(M) \ + M(UInt8) \ + M(UInt16) \ + M(UInt32) \ + 
M(UInt64) \ + M(Int8) \ + M(Int16) \ + M(Int32) \ + M(Int64) \ + M(Float32) \ + M(Float64) + +#define FOR_NUMERIC_TYPES(M) \ + M(UInt8) \ + M(UInt16) \ + M(UInt32) \ + M(UInt64) \ + M(UInt128) \ + M(UInt256) \ + M(Int8) \ + M(Int16) \ + M(Int32) \ + M(Int64) \ + M(Int128) \ + M(Int256) \ + M(Float32) \ + M(Float64) +} + +/// Formats a DB::DataTypePtr as the name of the type it points to (type->getName()). +/// See https://fmt.dev/latest/api.html#formatting-user-defined-types +template <> +struct fmt::formatter<DB::DataTypePtr> +{ + constexpr static auto parse(format_parse_context & ctx) + { + const auto * it = ctx.begin(); + const auto * end = ctx.end(); + + /// Only support {}. + if (it != end && *it != '}') + throw fmt::format_error("invalid format"); + + return it; + } + + /// Const-qualified: formatting never mutates the formatter, and {fmt} may invoke it on a const formatter object. + template <typename FormatContext> + auto format(const DB::DataTypePtr & type, FormatContext & ctx) const + { + return fmt::format_to(ctx.out(), "{}", type->getName()); + } +}; diff --git a/contrib/clickhouse/src/DataTypes/IDataTypeDummy.h b/contrib/clickhouse/src/DataTypes/IDataTypeDummy.h new file mode 100644 index 00000000000..fcfcbe43375 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/IDataTypeDummy.h @@ -0,0 +1,50 @@ +#pragma once + +#include <DataTypes/IDataType.h> +#include <Core/Field.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +/** The base class for data types that do not support serialization and deserialization, + * but arise only as an intermediate result of the calculations. + * + * That is, this class is used just to distinguish the corresponding data type from the others. 
+ */ +class IDataTypeDummy : public IDataType +{ +private: + [[noreturn]] void throwNoSerialization() const + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Serialization is not implemented for data type {}", getName()); + } + +public: + MutableColumnPtr createColumn() const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method createColumn() is not implemented for data type {}", getName()); + } + + Field getDefault() const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getDefault() is not implemented for data type {}", getName()); + } + + void insertDefaultInto(IColumn &) const override + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertDefaultInto() is not implemented for data type {}", getName()); + } + + bool haveSubtypes() const override { return false; } + bool cannotBeStoredInTables() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override { throwNoSerialization(); } +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Native.cpp b/contrib/clickhouse/src/DataTypes/Native.cpp new file mode 100644 index 00000000000..fd3716c2291 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Native.cpp @@ -0,0 +1,200 @@ +#include <DataTypes/Native.h> + +#if USE_EMBEDDED_COMPILER +# include <DataTypes/DataTypeNullable.h> +# include <Columns/ColumnConst.h> +# include <Columns/ColumnNullable.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR; +} + +bool typeIsSigned(const IDataType & type) +{ + WhichDataType data_type(type); + return data_type.isNativeInt() || data_type.isFloat() || data_type.isEnum() || data_type.isDate32(); +} + +llvm::Type * toNullableType(llvm::IRBuilderBase & builder, llvm::Type * type) +{ + auto * is_null_type = builder.getInt1Ty(); + return llvm::StructType::get(type, is_null_type); +} + +bool canBeNativeType(const IDataType & type) +{ + WhichDataType data_type(type); + + if 
(data_type.isNullable()) + { + const auto & data_type_nullable = static_cast<const DataTypeNullable&>(type); + return canBeNativeType(*data_type_nullable.getNestedType()); + } + + return data_type.isNativeInt() || data_type.isNativeUInt() || data_type.isFloat() || data_type.isDate() + || data_type.isDate32() || data_type.isDateTime() || data_type.isEnum(); +} + +bool canBeNativeType(const DataTypePtr & type) +{ + return canBeNativeType(*type); +} + +llvm::Type * toNativeType(llvm::IRBuilderBase & builder, const IDataType & type) +{ + WhichDataType data_type(type); + + if (data_type.isNullable()) + { + const auto & data_type_nullable = static_cast<const DataTypeNullable&>(type); + auto * nested_type = toNativeType(builder, *data_type_nullable.getNestedType()); + return toNullableType(builder, nested_type); + } + + /// LLVM doesn't have unsigned types, it has unsigned instructions. + if (data_type.isInt8() || data_type.isUInt8()) + return builder.getInt8Ty(); + else if (data_type.isInt16() || data_type.isUInt16() || data_type.isDate()) + return builder.getInt16Ty(); + else if (data_type.isInt32() || data_type.isUInt32() || data_type.isDate32() || data_type.isDateTime()) + return builder.getInt32Ty(); + else if (data_type.isInt64() || data_type.isUInt64()) + return builder.getInt64Ty(); + else if (data_type.isFloat32()) + return builder.getFloatTy(); + else if (data_type.isFloat64()) + return builder.getDoubleTy(); + else if (data_type.isEnum8()) + return builder.getInt8Ty(); + else if (data_type.isEnum16()) + return builder.getInt16Ty(); + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid cast to native type"); +} + +llvm::Type * toNativeType(llvm::IRBuilderBase & builder, const DataTypePtr & type) +{ + return toNativeType(builder, *type); +} + +llvm::Value * nativeBoolCast(llvm::IRBuilderBase & b, const DataTypePtr & from_type, llvm::Value * value) +{ + if (from_type->isNullable()) + { + auto * inner = nativeBoolCast(b, removeNullable(from_type), 
b.CreateExtractValue(value, {0})); + return b.CreateAnd(b.CreateNot(b.CreateExtractValue(value, {1})), inner); + } + + auto * zero = llvm::Constant::getNullValue(value->getType()); + + if (value->getType()->isIntegerTy()) + return b.CreateICmpNE(value, zero); + else if (value->getType()->isFloatingPointTy()) + return b.CreateFCmpUNE(value, zero); + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot cast non-number {} to bool", from_type->getName()); +} + +llvm::Value * nativeBoolCast(llvm::IRBuilderBase & b, const ValueWithType & value_with_type) +{ + return nativeBoolCast(b, value_with_type.type, value_with_type.value); +} + +/// Emits IR converting `value` from `from_type` to `to_type`. +/// A nullable source is unwrapped (element 0 of its struct); a nullable target is built as a struct +/// with the converted value in element 0. Throws LOGICAL_ERROR if no conversion between the native types applies. +llvm::Value * nativeCast(llvm::IRBuilderBase & b, const DataTypePtr & from_type, llvm::Value * value, const DataTypePtr & to_type) +{ + if (from_type->equals(*to_type)) + { + return value; + } + else if (from_type->isNullable() && to_type->isNullable()) + { + auto * inner = nativeCast(b, removeNullable(from_type), b.CreateExtractValue(value, {0}), to_type); + return b.CreateInsertValue(inner, b.CreateExtractValue(value, {1}), {1}); + } + else if (from_type->isNullable()) + { + return nativeCast(b, removeNullable(from_type), b.CreateExtractValue(value, {0}), to_type); + } + else if (to_type->isNullable()) + { + auto * to_native_type = toNativeType(b, to_type); + auto * inner = nativeCast(b, from_type, value, removeNullable(to_type)); + return b.CreateInsertValue(llvm::Constant::getNullValue(to_native_type), inner, {0}); + } + else + { + auto * from_native_type = toNativeType(b, from_type); + auto * to_native_type = toNativeType(b, to_type); + + if (from_native_type == to_native_type) + return value; + else if (from_native_type->isIntegerTy() && to_native_type->isFloatingPointTy()) + return typeIsSigned(*from_type) ? b.CreateSIToFP(value, to_native_type) : b.CreateUIToFP(value, to_native_type); + else if (from_native_type->isFloatingPointTy() && to_native_type->isIntegerTy()) + return typeIsSigned(*to_type) ? 
b.CreateFPToSI(value, to_native_type) : b.CreateFPToUI(value, to_native_type); + /// Integer to integer: the guard must check both sides; signedness of the source decides the extension. + else if (from_native_type->isIntegerTy() && to_native_type->isIntegerTy()) + return b.CreateIntCast(value, to_native_type, typeIsSigned(*from_type)); + /// Floating-point to floating-point: the guard must check both sides. + else if (from_native_type->isFloatingPointTy() && to_native_type->isFloatingPointTy()) + return b.CreateFPCast(value, to_native_type); + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Invalid cast to native value from type {} to type {}", + from_type->getName(), + to_type->getName()); +} + +llvm::Value * nativeCast(llvm::IRBuilderBase & b, const ValueWithType & value, const DataTypePtr & to_type) +{ + return nativeCast(b, value.type, value.value, to_type); +} + +llvm::Constant * getColumnNativeValue(llvm::IRBuilderBase & builder, const DataTypePtr & column_type, const IColumn & column, size_t index) +{ + if (const auto * constant = typeid_cast<const ColumnConst *>(&column)) + return getColumnNativeValue(builder, column_type, constant->getDataColumn(), 0); + + auto * type = toNativeType(builder, column_type); + + WhichDataType column_data_type(column_type); + if (column_data_type.isNullable()) + { + const auto & nullable_data_type = assert_cast<const DataTypeNullable &>(*column_type); + const auto & nullable_column = assert_cast<const ColumnNullable &>(column); + + auto * value = getColumnNativeValue(builder, nullable_data_type.getNestedType(), nullable_column.getNestedColumn(), index); + auto * is_null = llvm::ConstantInt::get(type->getContainedType(1), nullable_column.isNullAt(index)); + + return llvm::ConstantStruct::get(static_cast<llvm::StructType *>(type), value, is_null); + } + else if (column_data_type.isFloat32()) + { + return llvm::ConstantFP::get(type, assert_cast<const ColumnVector<Float32> &>(column).getElement(index)); + } + else if (column_data_type.isFloat64()) + { + return llvm::ConstantFP::get(type, assert_cast<const ColumnVector<Float64> &>(column).getElement(index)); + } + else if (column_data_type.isNativeUInt() 
|| column_data_type.isDate() || column_data_type.isDateTime()) + { + return llvm::ConstantInt::get(type, column.getUInt(index)); + } + else if (column_data_type.isNativeInt() || column_data_type.isEnum() || column_data_type.isDate32()) + { + return llvm::ConstantInt::get(type, column.getInt(index)); + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Cannot get native value for column with type {}", + column_type->getName()); +} + +} + +#endif diff --git a/contrib/clickhouse/src/DataTypes/Native.h b/contrib/clickhouse/src/DataTypes/Native.h new file mode 100644 index 00000000000..875248103c5 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Native.h @@ -0,0 +1,111 @@ +#pragma once + +#include "clickhouse_config.h" + +#if USE_EMBEDDED_COMPILER +# include <Common/Exception.h> +# include <Core/ValueWithType.h> +# include <DataTypes/IDataType.h> +# error #include <llvm/IR/IRBuilder.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +/// Returns true if type is signed, false otherwise +bool typeIsSigned(const IDataType & type); + +/// Cast LLVM type to nullable LLVM type +llvm::Type * toNullableType(llvm::IRBuilderBase & builder, llvm::Type * type); + +/// Returns true if type can be native LLVM type, false otherwise +bool canBeNativeType(const IDataType & type); + +/// Returns true if type can be native LLVM type, false otherwise +bool canBeNativeType(const DataTypePtr & type); + +template <typename Type> +static inline bool canBeNativeType() +{ + if constexpr (std::is_same_v<Type, Int8> || std::is_same_v<Type, UInt8>) + return true; + else if constexpr (std::is_same_v<Type, Int16> || std::is_same_v<Type, UInt16>) + return true; + else if constexpr (std::is_same_v<Type, Int32> || std::is_same_v<Type, UInt32>) + return true; + else if constexpr (std::is_same_v<Type, Int64> || std::is_same_v<Type, UInt64>) + return true; + else if constexpr (std::is_same_v<Type, Float32> || std::is_same_v<Type, Float64>) + return true; + + 
return false; +} + +/// Cast type to native LLVM type +llvm::Type * toNativeType(llvm::IRBuilderBase & builder, const IDataType & type); + +/// Cast type to native LLVM type +llvm::Type * toNativeType(llvm::IRBuilderBase & builder, const DataTypePtr & type); + +template <typename ToType> +static inline llvm::Type * toNativeType(llvm::IRBuilderBase & builder) +{ + if constexpr (std::is_same_v<ToType, Int8> || std::is_same_v<ToType, UInt8>) + return builder.getInt8Ty(); + else if constexpr (std::is_same_v<ToType, Int16> || std::is_same_v<ToType, UInt16>) + return builder.getInt16Ty(); + else if constexpr (std::is_same_v<ToType, Int32> || std::is_same_v<ToType, UInt32>) + return builder.getInt32Ty(); + else if constexpr (std::is_same_v<ToType, Int64> || std::is_same_v<ToType, UInt64>) + return builder.getInt64Ty(); + else if constexpr (std::is_same_v<ToType, Float32>) + return builder.getFloatTy(); + else if constexpr (std::is_same_v<ToType, Float64>) + return builder.getDoubleTy(); + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid cast to native type"); +} + +template <typename ToType> +static inline DataTypePtr toNativeDataType() +{ + if constexpr (std::is_same_v<ToType, Int8> || std::is_same_v<ToType, UInt8> || + std::is_same_v<ToType, Int16> || std::is_same_v<ToType, UInt16> || + std::is_same_v<ToType, Int32> || std::is_same_v<ToType, UInt32> || + std::is_same_v<ToType, Int64> || std::is_same_v<ToType, UInt64> || + std::is_same_v<ToType, Float32> || std::is_same_v<ToType, Float64>) + return std::make_shared<DataTypeNumber<ToType>>(); + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid cast to native data type"); +} + +/// Cast LLVM value with type to bool +llvm::Value * nativeBoolCast(llvm::IRBuilderBase & b, const DataTypePtr & from_type, llvm::Value * value); + +/// Cast LLVM value with type to bool +llvm::Value * nativeBoolCast(llvm::IRBuilderBase & b, const ValueWithType & value_with_type); + +/// Cast LLVM value with type to specified type 
+llvm::Value * nativeCast(llvm::IRBuilderBase & b, const DataTypePtr & from_type, llvm::Value * value, const DataTypePtr & to_type); + +/// Cast LLVM value with type to specified type +llvm::Value * nativeCast(llvm::IRBuilderBase & b, const ValueWithType & value, const DataTypePtr & to_type); + +template <typename FromType> +static inline llvm::Value * nativeCast(llvm::IRBuilderBase & b, llvm::Value * value, const DataTypePtr & to) +{ + auto native_data_type = toNativeDataType<FromType>(); + return nativeCast(b, native_data_type, value, to); +} + +/// Get column value for specified index as LLVM constant +llvm::Constant * getColumnNativeValue(llvm::IRBuilderBase & builder, const DataTypePtr & column_type, const IColumn & column, size_t index); + +} + +#endif diff --git a/contrib/clickhouse/src/DataTypes/NestedUtils.cpp b/contrib/clickhouse/src/DataTypes/NestedUtils.cpp new file mode 100644 index 00000000000..9ee803c4235 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/NestedUtils.cpp @@ -0,0 +1,360 @@ +#include <cstring> +#include <memory> + +#include <Common/typeid_cast.h> +#include <Common/assert_cast.h> +#include <Common/StringUtils/StringUtils.h> +#include "Columns/IColumn.h" + +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/NestedUtils.h> +#include <DataTypes/DataTypeNested.h> + +#include <Columns/ColumnArray.h> +#include <Columns/ColumnTuple.h> +#include <Columns/ColumnConst.h> + +#include <Parsers/IAST.h> + +#include <boost/algorithm/string/case_conv.hpp> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int SIZES_OF_ARRAYS_DONT_MATCH; +} + +namespace Nested +{ + +std::string concatenateName(const std::string & nested_table_name, const std::string & nested_field_name) +{ + if (nested_table_name.empty()) + return nested_field_name; + + if (nested_field_name.empty()) + return nested_table_name; + + return nested_table_name + "." 
+ nested_field_name; +} + + +/** Name can be treated as compound if it contains dot (.) in the middle. + */ +std::pair<std::string, std::string> splitName(const std::string & name, bool reverse) +{ + auto idx = (reverse ? name.find_last_of('.') : name.find_first_of('.')); + if (idx == std::string::npos || idx == 0 || idx + 1 == name.size()) + return {name, {}}; + + return {name.substr(0, idx), name.substr(idx + 1)}; +} + +std::pair<std::string_view, std::string_view> splitName(std::string_view name, bool reverse) +{ + auto idx = (reverse ? name.find_last_of('.') : name.find_first_of('.')); + if (idx == std::string::npos || idx == 0 || idx + 1 == name.size()) + return {name, {}}; + + return {name.substr(0, idx), name.substr(idx + 1)}; +} + + +std::string extractTableName(const std::string & nested_name) +{ + auto split = splitName(nested_name); + return split.first; +} + + +static Block flattenImpl(const Block & block, bool flatten_named_tuple) +{ + Block res; + + for (const auto & elem : block) + { + if (const DataTypeArray * type_arr = typeid_cast<const DataTypeArray *>(elem.type.get())) + { + const DataTypeTuple * type_tuple = typeid_cast<const DataTypeTuple *>(type_arr->getNestedType().get()); + if (type_tuple && type_tuple->haveExplicitNames()) + { + const DataTypes & element_types = type_tuple->getElements(); + const Strings & names = type_tuple->getElementNames(); + size_t tuple_size = element_types.size(); + + bool is_const = isColumnConst(*elem.column); + const ColumnArray * column_array; + if (is_const) + column_array = typeid_cast<const ColumnArray *>(&assert_cast<const ColumnConst &>(*elem.column).getDataColumn()); + else + column_array = typeid_cast<const ColumnArray *>(elem.column.get()); + + const ColumnPtr & column_offsets = column_array->getOffsetsPtr(); + + const ColumnTuple & column_tuple = typeid_cast<const ColumnTuple &>(column_array->getData()); + const auto & element_columns = column_tuple.getColumns(); + + for (size_t i = 0; i < tuple_size; 
++i) + { + String nested_name = concatenateName(elem.name, names[i]); + ColumnPtr column_array_of_element = ColumnArray::create(element_columns[i], column_offsets); + + res.insert(ColumnWithTypeAndName( + is_const + ? ColumnConst::create(std::move(column_array_of_element), block.rows()) + : column_array_of_element, + std::make_shared<DataTypeArray>(element_types[i]), + nested_name)); + } + } + else + res.insert(elem); + } + else if (const DataTypeTuple * type_tuple = typeid_cast<const DataTypeTuple *>(elem.type.get()); type_tuple && flatten_named_tuple) + { + if (type_tuple->haveExplicitNames()) + { + const DataTypes & element_types = type_tuple->getElements(); + const Strings & names = type_tuple->getElementNames(); + const ColumnTuple * column_tuple; + if (isColumnConst(*elem.column)) + column_tuple = typeid_cast<const ColumnTuple *>(&assert_cast<const ColumnConst &>(*elem.column).getDataColumn()); + else + column_tuple = typeid_cast<const ColumnTuple *>(elem.column.get()); + size_t tuple_size = column_tuple->tupleSize(); + for (size_t i = 0; i < tuple_size; ++i) + { + const auto & element_column = column_tuple->getColumn(i); + String nested_name = concatenateName(elem.name, names[i]); + res.insert(ColumnWithTypeAndName(element_column.getPtr(), element_types[i], nested_name)); + } + } + else + res.insert(elem); + } + else + res.insert(elem); + } + + return res; +} + +Block flatten(const Block & block) +{ + return flattenImpl(block, true); +} + + +Block flattenArrayOfTuples(const Block & block) +{ + return flattenImpl(block, false); +} + +namespace +{ + +using NameToDataType = std::map<String, DataTypePtr>; + +NameToDataType getSubcolumnsOfNested(const NamesAndTypesList & names_and_types) +{ + std::unordered_map<String, NamesAndTypesList> nested; + for (const auto & name_type : names_and_types) + { + const DataTypeArray * type_arr = typeid_cast<const DataTypeArray *>(name_type.type.get()); + + /// Ignore true Nested type, but try to unite flatten arrays to Nested 
type. + if (!isNested(name_type.type) && type_arr) + { + auto split = splitName(name_type.name); + if (!split.second.empty()) + nested[split.first].emplace_back(split.second, type_arr->getNestedType()); + } + } + + std::map<String, DataTypePtr> nested_types; + + for (const auto & [name, elems] : nested) + nested_types.emplace(name, createNested(elems.getTypes(), elems.getNames())); + + return nested_types; +} + +} + +NamesAndTypesList collect(const NamesAndTypesList & names_and_types) +{ + NamesAndTypesList res; + auto nested_types = getSubcolumnsOfNested(names_and_types); + + for (const auto & name_type : names_and_types) + if (!isArray(name_type.type) || !nested_types.contains(splitName(name_type.name).first)) + res.push_back(name_type); + + for (const auto & name_type : nested_types) + res.emplace_back(name_type.first, name_type.second); + + return res; +} + +NamesAndTypesList convertToSubcolumns(const NamesAndTypesList & names_and_types) +{ + auto nested_types = getSubcolumnsOfNested(names_and_types); + auto res = names_and_types; + + for (auto & name_type : res) + { + if (!isArray(name_type.type)) + continue; + + auto split = splitName(name_type.name); + if (name_type.isSubcolumn() || split.second.empty()) + continue; + + auto it = nested_types.find(split.first); + if (it != nested_types.end()) + name_type = NameAndTypePair{split.first, split.second, it->second, it->second->getSubcolumnType(split.second)}; + } + + return res; +} + + +void validateArraySizes(const Block & block) +{ + /// Nested prefix -> position of first column in block. 
+ std::map<std::string, size_t> nested; + + for (size_t i = 0, size = block.columns(); i < size; ++i) + { + const auto & elem = block.getByPosition(i); + + if (isArray(elem.type)) + { + if (!typeid_cast<const ColumnArray *>(elem.column.get())) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Column with Array type is not represented by ColumnArray column: {}", + elem.column->dumpStructure()); + + auto split = splitName(elem.name); + + /// Is it really a column of Nested data structure. + if (!split.second.empty()) + { + auto [it, inserted] = nested.emplace(split.first, i); + + /// It's not the first column of Nested data structure. + if (!inserted) + { + const ColumnArray & first_array_column = assert_cast<const ColumnArray &>(*block.getByPosition(it->second).column); + const ColumnArray & another_array_column = assert_cast<const ColumnArray &>(*elem.column); + + if (!first_array_column.hasEqualOffsets(another_array_column)) + throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH, + "Elements '{}' and '{}' " + "of Nested data structure '{}' (Array columns) have different array sizes.", + block.getByPosition(it->second).name, elem.name, split.first); + } + } + } + } +} + + +std::unordered_set<String> getAllTableNames(const Block & block, bool to_lower_case) +{ + std::unordered_set<String> nested_table_names; + for (const auto & name : block.getNames()) + { + auto nested_table_name = Nested::extractTableName(name); + if (to_lower_case) + boost::to_lower(nested_table_name); + + if (!nested_table_name.empty()) + nested_table_names.insert(std::move(nested_table_name)); + } + return nested_table_names; +} + +Names getAllNestedColumnsForTable(const Block & block, const std::string & table_name) +{ + Names names; + for (const auto & name: block.getNames()) + { + if (extractTableName(name) == table_name) + names.push_back(name); + } + return names; +} + +} + +NestedColumnExtractHelper::NestedColumnExtractHelper(const Block & block_, bool case_insentive_) + : block(block_) 
+ , case_insentive(case_insentive_) +{} + +std::optional<ColumnWithTypeAndName> NestedColumnExtractHelper::extractColumn(const String & column_name) +{ + if (block.has(column_name, case_insentive)) + return {block.getByName(column_name, case_insentive)}; + + auto nested_names = Nested::splitName(column_name); + if (case_insentive) + { + boost::to_lower(nested_names.first); + boost::to_lower(nested_names.second); + } + if (!block.has(nested_names.first, case_insentive)) + return {}; + + if (!nested_tables.contains(nested_names.first)) + { + ColumnsWithTypeAndName columns = {block.getByName(nested_names.first, case_insentive)}; + nested_tables[nested_names.first] = std::make_shared<Block>(Nested::flatten(columns)); + } + + return extractColumn(column_name, nested_names.first, nested_names.second); +} + +std::optional<ColumnWithTypeAndName> NestedColumnExtractHelper::extractColumn( + const String & original_column_name, const String & column_name_prefix, const String & column_name_suffix) +{ + auto table_iter = nested_tables.find(column_name_prefix); + if (table_iter == nested_tables.end()) + { + return {}; + } + + auto & nested_table = table_iter->second; + auto nested_names = Nested::splitName(column_name_suffix); + auto new_column_name_prefix = Nested::concatenateName(column_name_prefix, nested_names.first); + if (nested_names.second.empty()) + { + if (auto * column_ref = nested_table->findByName(new_column_name_prefix, case_insentive)) + { + ColumnWithTypeAndName column = *column_ref; + if (case_insentive) + column.name = original_column_name; + return {std::move(column)}; + } + else + { + return {}; + } + } + + if (!nested_table->has(new_column_name_prefix, case_insentive)) + { + return {}; + } + + ColumnsWithTypeAndName columns = {nested_table->getByName(new_column_name_prefix, case_insentive)}; + Block sub_block(columns); + nested_tables[new_column_name_prefix] = std::make_shared<Block>(Nested::flatten(sub_block)); + return extractColumn(original_column_name, 
new_column_name_prefix, nested_names.second); +} +} diff --git a/contrib/clickhouse/src/DataTypes/NestedUtils.h b/contrib/clickhouse/src/DataTypes/NestedUtils.h new file mode 100644 index 00000000000..e009ceb18fe --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/NestedUtils.h @@ -0,0 +1,61 @@ +#pragma once + +#include <Core/Block.h> +#include <Core/NamesAndTypes.h> + + +namespace DB +{ + +namespace Nested +{ + std::string concatenateName(const std::string & nested_table_name, const std::string & nested_field_name); + + /// Splits name of compound identifier by first/last dot (depending on 'reverse' parameter). + std::pair<std::string, std::string> splitName(const std::string & name, bool reverse = false); + std::pair<std::string_view, std::string_view> splitName(std::string_view name, bool reverse = false); + + /// Returns the prefix of the name to the first '.'. Or the name is unchanged if there is no dot. + std::string extractTableName(const std::string & nested_name); + + /// Flat a column of nested type into columns + /// 1) For named tuples,t Tuple(x .., y ..., ...), replace it with t.x ..., t.y ... , ... + /// 2) For an Array with named Tuple element column, a Array(Tuple(x ..., y ..., ...)), replace it with multiple Array Columns, a.x ..., a.y ..., ... + Block flatten(const Block & block); + + /// Same as flatten but only for Array with named Tuple element column. + Block flattenArrayOfTuples(const Block & block); + + /// Collect Array columns in a form of `column_name.element_name` to single Array(Tuple(...)) column. + NamesAndTypesList collect(const NamesAndTypesList & names_and_types); + + /// Convert old-style nested (single arrays with same prefix, `n.a`, `n.b`...) to subcolumns of data type Nested. + NamesAndTypesList convertToSubcolumns(const NamesAndTypesList & names_and_types); + + /// Check that sizes of arrays - elements of nested data structures - are equal. 
/// Larger of two sizes; usable in constant expressions (e.g. as a template argument).
/// When both are equal, `y` is returned.
constexpr size_t max(size_t x, size_t y)
{
    return y < x ? x : y;
}
/// Doubles `size` while it is below 8 bytes; sizes of 8 bytes and more are returned unchanged.
constexpr size_t nextSize(size_t size)
{
    return size < 8 ? size * 2 : size;
}
meaningful) than the maximum in the arguments + * (for example, UInt8 + Int32 = Int64). + */ +template <typename A, typename B> struct ResultOfAdditionMultiplication +{ + using Type = typename Construct< + is_signed_v<A> || is_signed_v<B>, + std::is_floating_point_v<A> || std::is_floating_point_v<B>, + nextSize(max(sizeof(A), sizeof(B)))>::Type; +}; + +template <typename A, typename B> struct ResultOfSubtraction +{ + using Type = typename Construct< + true, + std::is_floating_point_v<A> || std::is_floating_point_v<B>, + nextSize(max(sizeof(A), sizeof(B)))>::Type; +}; + +/** When dividing, you always get a floating-point number. + */ +template <typename A, typename B> struct ResultOfFloatingPointDivision +{ + using Type = Float64; +}; + +/** For integer division, we get a number with the same number of bits as in divisible. + */ +template <typename A, typename B> struct ResultOfIntegerDivision +{ + using Type = typename Construct< + is_signed_v<A> || is_signed_v<B>, + false, + sizeof(A)>::Type; +}; + +/** Division with remainder you get a number with the same number of bits as in divisor, + * or larger in case of signed type. + */ +template <typename A, typename B> struct ResultOfModulo +{ + static constexpr bool result_is_signed = is_signed_v<A>; + /// If modulo of division can yield negative number, we need larger type to accommodate it. + /// Example: toInt32(-199) % toUInt8(200) will return -199 that does not fit in Int8, only in Int16. + static constexpr size_t size_of_result = result_is_signed ? nextSize(sizeof(B)) : sizeof(B); + using Type0 = typename Construct<result_is_signed, false, size_of_result>::Type; + using Type = std::conditional_t<std::is_floating_point_v<A> || std::is_floating_point_v<B>, Float64, Type0>; +}; + +template <typename A, typename B> struct ResultOfPositiveModulo +{ + /// function positive_modulo always return non-negative number. 
+ static constexpr size_t size_of_result = sizeof(B); + using Type0 = typename Construct<false, false, size_of_result>::Type; + using Type = std::conditional_t<std::is_floating_point_v<A> || std::is_floating_point_v<B>, Float64, Type0>; +}; + + +template <typename A, typename B> struct ResultOfModuloLegacy +{ + using Type0 = typename Construct<is_signed_v<A> || is_signed_v<B>, false, sizeof(B)>::Type; + using Type = std::conditional_t<std::is_floating_point_v<A> || std::is_floating_point_v<B>, Float64, Type0>; +}; + +template <typename A> struct ResultOfNegate +{ + using Type = typename Construct< + true, + std::is_floating_point_v<A>, + is_signed_v<A> ? sizeof(A) : nextSize(sizeof(A))>::Type; +}; + +template <typename A> struct ResultOfAbs +{ + using Type = typename Construct< + false, + std::is_floating_point_v<A>, + sizeof(A)>::Type; +}; + +/** For bitwise operations, an integer is obtained with number of bits is equal to the maximum of the arguments. + */ +template <typename A, typename B> struct ResultOfBit +{ + using Type = typename Construct< + is_signed_v<A> || is_signed_v<B>, + false, + std::is_floating_point_v<A> || std::is_floating_point_v<B> ? 
8 : max(sizeof(A), sizeof(B))>::Type; +}; + +template <typename A> struct ResultOfBitNot +{ + using Type = typename Construct< + is_signed_v<A>, + false, + sizeof(A)>::Type; +}; + + +/** Type casting for `if` function: + * UInt<x>, UInt<y> -> UInt<max(x,y)> + * Int<x>, Int<y> -> Int<max(x,y)> + * Float<x>, Float<y> -> Float<max(x, y)> + * UInt<x>, Int<y> -> Int<max(x*2, y)> + * Float<x>, [U]Int<y> -> Float<max(x, y*2)> + * Decimal<x>, Decimal<y> -> Decimal<max(x,y)> + * UUID, UUID -> UUID + * UInt64, Int<x> -> Error + * Float<x>, [U]Int64 -> Error + */ +template <typename A, typename B> +struct ResultOfIf +{ + static constexpr bool has_float = std::is_floating_point_v<A> || std::is_floating_point_v<B>; + static constexpr bool has_integer = is_integer<A> || is_integer<B>; + static constexpr bool has_signed = is_signed_v<A> || is_signed_v<B>; + static constexpr bool has_unsigned = !is_signed_v<A> || !is_signed_v<B>; + static constexpr bool has_big_int = is_big_int_v<A> || is_big_int_v<B>; + + static constexpr size_t max_size_of_unsigned_integer = max(is_signed_v<A> ? 0 : sizeof(A), is_signed_v<B> ? 0 : sizeof(B)); + static constexpr size_t max_size_of_signed_integer = max(is_signed_v<A> ? sizeof(A) : 0, is_signed_v<B> ? sizeof(B) : 0); + static constexpr size_t max_size_of_integer = max(is_integer<A> ? sizeof(A) : 0, is_integer<B> ? sizeof(B) : 0); + static constexpr size_t max_size_of_float = max(std::is_floating_point_v<A> ? sizeof(A) : 0, std::is_floating_point_v<B> ? sizeof(B) : 0); + + using ConstructedType = typename Construct<has_signed, has_float, + ((has_float && has_integer && max_size_of_integer >= max_size_of_float) + || (has_signed && has_unsigned && max_size_of_unsigned_integer >= max_size_of_signed_integer)) + ? 
max(sizeof(A), sizeof(B)) * 2 + : max(sizeof(A), sizeof(B))>::Type; + + using Type = + std::conditional_t<std::is_same_v<A, B>, A, + std::conditional_t<is_decimal<A> && is_decimal<B>, + std::conditional_t<(sizeof(A) > sizeof(B)), A, B>, + std::conditional_t<!is_decimal<A> && !is_decimal<B>, + ConstructedType, Error>>>; +}; + +/** Before applying operator `%` and bitwise operations, operands are casted to whole numbers. */ +template <typename A> struct ToInteger +{ + using Type = typename Construct< + is_signed_v<A>, + false, + std::is_floating_point_v<A> ? 8 : sizeof(A)>::Type; +}; + + +// CLICKHOUSE-29. The same depth, different signs +// NOTE: This case is applied for 64-bit integers only (for backward compatibility), but could be used for any-bit integers +template <typename A, typename B> +constexpr bool LeastGreatestSpecialCase = + std::is_integral_v<A> && std::is_integral_v<B> + && (8 == sizeof(A) && sizeof(A) == sizeof(B)) + && (is_signed_v<A> ^ is_signed_v<B>); + +template <typename A, typename B> +using ResultOfLeast = std::conditional_t<LeastGreatestSpecialCase<A, B>, + typename Construct<true, false, sizeof(A)>::Type, + typename ResultOfIf<A, B>::Type>; + +template <typename A, typename B> +using ResultOfGreatest = std::conditional_t<LeastGreatestSpecialCase<A, B>, + typename Construct<false, false, sizeof(A)>::Type, + typename ResultOfIf<A, B>::Type>; + +} + +template <typename T> +static inline auto littleBits(const T & x) +{ + return static_cast<UInt8>(x); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/ObjectUtils.cpp b/contrib/clickhouse/src/DataTypes/ObjectUtils.cpp new file mode 100644 index 00000000000..28f000b6f0d --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/ObjectUtils.cpp @@ -0,0 +1,992 @@ +#include <DataTypes/ObjectUtils.h> +#include <DataTypes/DataTypeObject.h> +#include <DataTypes/DataTypeNothing.h> +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeMap.h> +#include <DataTypes/DataTypeNullable.h> +#include 
<DataTypes/DataTypesNumber.h> +#include <DataTypes/DataTypeNested.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/getLeastSupertype.h> +#include <DataTypes/NestedUtils.h> +#include <Storages/StorageSnapshot.h> +#include <Columns/ColumnObject.h> +#include <Columns/ColumnTuple.h> +#include <Columns/ColumnArray.h> +#include <Columns/ColumnMap.h> +#include <Columns/ColumnNullable.h> +#include <Parsers/ASTSelectQuery.h> +#include <Parsers/ASTExpressionList.h> +#include <Parsers/ASTLiteral.h> +#include <Parsers/ASTFunction.h> +#include <IO/Operators.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int TYPE_MISMATCH; + extern const int LOGICAL_ERROR; + extern const int INCOMPATIBLE_COLUMNS; + extern const int NOT_IMPLEMENTED; +} + +size_t getNumberOfDimensions(const IDataType & type) +{ + if (const auto * type_array = typeid_cast<const DataTypeArray *>(&type)) + return type_array->getNumberOfDimensions(); + return 0; +} + +size_t getNumberOfDimensions(const IColumn & column) +{ + if (const auto * column_array = checkAndGetColumn<ColumnArray>(column)) + return column_array->getNumberOfDimensions(); + return 0; +} + +DataTypePtr getBaseTypeOfArray(const DataTypePtr & type) +{ + /// Get raw pointers to avoid extra copying of type pointers. + const DataTypeArray * last_array = nullptr; + const auto * current_type = type.get(); + while (const auto * type_array = typeid_cast<const DataTypeArray *>(current_type)) + { + current_type = type_array->getNestedType().get(); + last_array = type_array; + } + + return last_array ? last_array->getNestedType() : type; +} + +ColumnPtr getBaseColumnOfArray(const ColumnPtr & column) +{ + /// Get raw pointers to avoid extra copying of column pointers. 
+ const ColumnArray * last_array = nullptr; + const auto * current_column = column.get(); + while (const auto * column_array = checkAndGetColumn<ColumnArray>(current_column)) + { + current_column = &column_array->getData(); + last_array = column_array; + } + + return last_array ? last_array->getDataPtr() : column; +} + +DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions) +{ + for (size_t i = 0; i < num_dimensions; ++i) + type = std::make_shared<DataTypeArray>(std::move(type)); + return type; +} + +ColumnPtr createArrayOfColumn(ColumnPtr column, size_t num_dimensions) +{ + for (size_t i = 0; i < num_dimensions; ++i) + column = ColumnArray::create(column); + return column; +} + +Array createEmptyArrayField(size_t num_dimensions) +{ + if (num_dimensions == 0) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create array field with 0 dimensions"); + + Array array; + Array * current_array = &array; + for (size_t i = 1; i < num_dimensions; ++i) + { + current_array->push_back(Array()); + current_array = ¤t_array->back().get<Array &>(); + } + + return array; +} + +DataTypePtr getDataTypeByColumn(const IColumn & column) +{ + auto idx = column.getDataType(); + WhichDataType which(idx); + if (which.isSimple()) + return DataTypeFactory::instance().get(String(magic_enum::enum_name(idx))); + + if (which.isNothing()) + return std::make_shared<DataTypeNothing>(); + + if (const auto * column_array = checkAndGetColumn<ColumnArray>(&column)) + return std::make_shared<DataTypeArray>(getDataTypeByColumn(column_array->getData())); + + if (const auto * column_nullable = checkAndGetColumn<ColumnNullable>(&column)) + return makeNullable(getDataTypeByColumn(column_nullable->getNestedColumn())); + + /// TODO: add more types. 
/// Copies the I-th component of every tuple in `vec` into a new vector, preserving order.
template <size_t I, typename Tuple>
static auto extractVector(const std::vector<Tuple> & vec)
{
    static_assert(I < std::tuple_size_v<Tuple>);

    using Element = std::tuple_element_t<I, Tuple>;

    std::vector<Element> components;
    components.reserve(vec.size());
    for (size_t row = 0; row < vec.size(); ++row)
        components.emplace_back(std::get<I>(vec[row]));

    return components;
}
typeid_cast<const DataTypeArray *>(type.get())) + { + const auto & column_array = assert_cast<const ColumnArray &>(*column); + auto [new_column, new_type] = recursivlyConvertDynamicColumnToTuple( + column_array.getDataPtr(), type_array->getNestedType()); + + return + { + ColumnArray::create(new_column, column_array.getOffsetsPtr()), + std::make_shared<DataTypeArray>(std::move(new_type)), + }; + } + + if (const auto * type_map = typeid_cast<const DataTypeMap *>(type.get())) + { + const auto & column_map = assert_cast<const ColumnMap &>(*column); + auto [new_column, new_type] = recursivlyConvertDynamicColumnToTuple( + column_map.getNestedColumnPtr(), type_map->getNestedType()); + + return + { + ColumnMap::create(new_column), + std::make_shared<DataTypeMap>(std::move(new_type)), + }; + } + + if (const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type.get())) + { + const auto & tuple_columns = assert_cast<const ColumnTuple &>(*column).getColumns(); + const auto & tuple_types = type_tuple->getElements(); + + assert(tuple_columns.size() == tuple_types.size()); + const size_t tuple_size = tuple_types.size(); + + Columns new_tuple_columns(tuple_size); + DataTypes new_tuple_types(tuple_size); + + for (size_t i = 0; i < tuple_size; ++i) + { + std::tie(new_tuple_columns[i], new_tuple_types[i]) + = recursivlyConvertDynamicColumnToTuple(tuple_columns[i], tuple_types[i]); + } + + return + { + ColumnTuple::create(new_tuple_columns), + recreateTupleWithElements(*type_tuple, new_tuple_types) + }; + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Type {} unexpectedly has dynamic columns", type->getName()); +} + +void convertDynamicColumnsToTuples(Block & block, const StorageSnapshotPtr & storage_snapshot) +{ + for (auto & column : block) + { + if (!column.type->hasDynamicSubcolumns()) + continue; + + std::tie(column.column, column.type) + = recursivlyConvertDynamicColumnToTuple(column.column, column.type); + + GetColumnsOptions options(GetColumnsOptions::AllPhysical); + 
auto storage_column = storage_snapshot->tryGetColumn(options, column.name); + if (!storage_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' not found in storage", column.name); + + auto storage_column_concrete = storage_snapshot->getColumn(options.withExtendedObjects(), column.name); + + /// Check that constructed Tuple type and type in storage are compatible. + getLeastCommonTypeForDynamicColumns( + storage_column->type, {column.type, storage_column_concrete.type}, true); + } +} + +static bool isPrefix(const PathInData::Parts & prefix, const PathInData::Parts & parts) +{ + if (prefix.size() > parts.size()) + return false; + + for (size_t i = 0; i < prefix.size(); ++i) + if (prefix[i].key != parts[i].key) + return false; + return true; +} + +/// Returns true if there exists a prefix with matched names, +/// but not matched structure (is Nested, number of dimensions). +static bool hasDifferentStructureInPrefix(const PathInData::Parts & lhs, const PathInData::Parts & rhs) +{ + for (size_t i = 0; i < std::min(lhs.size(), rhs.size()); ++i) + { + if (lhs[i].key != rhs[i].key) + return false; + else if (lhs[i] != rhs[i]) + return true; + } + return false; +} + +void checkObjectHasNoAmbiguosPaths(const PathsInData & paths) +{ + size_t size = paths.size(); + for (size_t i = 0; i < size; ++i) + { + for (size_t j = 0; j < i; ++j) + { + if (isPrefix(paths[i].getParts(), paths[j].getParts()) + || isPrefix(paths[j].getParts(), paths[i].getParts())) + throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, + "Data in Object has ambiguous paths: '{}' and '{}'", + paths[i].getPath(), paths[j].getPath()); + + if (hasDifferentStructureInPrefix(paths[i].getParts(), paths[j].getParts())) + throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, + "Data in Object has ambiguous paths: '{}' and '{}'. 
" + "Paths have prefixes matched by names, but different in structure", + paths[i].getPath(), paths[j].getPath()); + } + } +} + +static DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool check_ambiguos_paths) +{ + /// Types of subcolumns by path from all tuples. + std::unordered_map<PathInData, DataTypes, PathInData::Hash> subcolumns_types; + + /// First we flatten tuples, then get common type for paths + /// and finally unflatten paths and create new tuple type. + for (const auto & type : types) + { + const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type.get()); + if (!type_tuple) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Least common type for object can be deduced only from tuples, but {} given", type->getName()); + + auto [tuple_paths, tuple_types] = flattenTuple(type); + assert(tuple_paths.size() == tuple_types.size()); + + for (size_t i = 0; i < tuple_paths.size(); ++i) + subcolumns_types[tuple_paths[i]].push_back(tuple_types[i]); + } + + PathsInData tuple_paths; + DataTypes tuple_types; + + /// Get the least common type for all paths. 
+ for (const auto & [key, subtypes] : subcolumns_types) + { + assert(!subtypes.empty()); + if (key.getPath() == ColumnObject::COLUMN_NAME_DUMMY) + continue; + + size_t first_dim = getNumberOfDimensions(*subtypes[0]); + for (size_t i = 1; i < subtypes.size(); ++i) + if (first_dim != getNumberOfDimensions(*subtypes[i])) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Uncompatible types of subcolumn '{}': {} and {}", + key.getPath(), subtypes[0]->getName(), subtypes[i]->getName()); + + tuple_paths.emplace_back(key); + tuple_types.emplace_back(getLeastSupertypeOrString(subtypes)); + } + + if (tuple_paths.empty()) + { + tuple_paths.emplace_back(ColumnObject::COLUMN_NAME_DUMMY); + tuple_types.emplace_back(std::make_shared<DataTypeUInt8>()); + } + + if (check_ambiguos_paths) + checkObjectHasNoAmbiguosPaths(tuple_paths); + + return unflattenTuple(tuple_paths, tuple_types); +} + +static DataTypePtr getLeastCommonTypeForDynamicColumnsImpl( + const DataTypePtr & type_in_storage, const DataTypes & concrete_types, bool check_ambiguos_paths); + +template<typename Type> +static DataTypePtr getLeastCommonTypeForColumnWithNestedType( + const Type & type, const DataTypes & concrete_types, bool check_ambiguos_paths) +{ + DataTypes nested_types; + nested_types.reserve(concrete_types.size()); + + for (const auto & concrete_type : concrete_types) + { + const auto * type_with_nested_conctete = typeid_cast<const Type *>(concrete_type.get()); + if (!type_with_nested_conctete) + throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected {} type, got {}", demangle(typeid(Type).name()), concrete_type->getName()); + + nested_types.push_back(type_with_nested_conctete->getNestedType()); + } + + return std::make_shared<Type>( + getLeastCommonTypeForDynamicColumnsImpl( + type.getNestedType(), nested_types, check_ambiguos_paths)); +} + +static DataTypePtr getLeastCommonTypeForTuple( + const DataTypeTuple & type, const DataTypes & concrete_types, bool check_ambiguos_paths) +{ + const auto & element_types 
= type.getElements(); + DataTypes new_element_types(element_types.size()); + + for (size_t i = 0; i < element_types.size(); ++i) + { + DataTypes concrete_element_types; + concrete_element_types.reserve(concrete_types.size()); + + for (const auto & type_concrete : concrete_types) + { + const auto * type_tuple_conctete = typeid_cast<const DataTypeTuple *>(type_concrete.get()); + if (!type_tuple_conctete) + throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected Tuple type, got {}", type_concrete->getName()); + + concrete_element_types.push_back(type_tuple_conctete->getElement(i)); + } + + new_element_types[i] = getLeastCommonTypeForDynamicColumnsImpl( + element_types[i], concrete_element_types, check_ambiguos_paths); + } + + return recreateTupleWithElements(type, new_element_types); +} + +static DataTypePtr getLeastCommonTypeForDynamicColumnsImpl( + const DataTypePtr & type_in_storage, const DataTypes & concrete_types, bool check_ambiguos_paths) +{ + if (!type_in_storage->hasDynamicSubcolumns()) + return type_in_storage; + + if (isObject(type_in_storage)) + return getLeastCommonTypeForObject(concrete_types, check_ambiguos_paths); + + if (const auto * type_array = typeid_cast<const DataTypeArray *>(type_in_storage.get())) + return getLeastCommonTypeForColumnWithNestedType(*type_array, concrete_types, check_ambiguos_paths); + + if (const auto * type_map = typeid_cast<const DataTypeMap *>(type_in_storage.get())) + return getLeastCommonTypeForColumnWithNestedType(*type_map, concrete_types, check_ambiguos_paths); + + if (const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type_in_storage.get())) + return getLeastCommonTypeForTuple(*type_tuple, concrete_types, check_ambiguos_paths); + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Type {} unexpectedly has dynamic columns", type_in_storage->getName()); +} + +DataTypePtr getLeastCommonTypeForDynamicColumns( + const DataTypePtr & type_in_storage, const DataTypes & concrete_types, bool check_ambiguos_paths) +{ + if 
(concrete_types.empty()) + return nullptr; + + bool all_equal = true; + for (size_t i = 1; i < concrete_types.size(); ++i) + { + if (!concrete_types[i]->equals(*concrete_types[0])) + { + all_equal = false; + break; + } + } + + if (all_equal) + return concrete_types[0]; + + return getLeastCommonTypeForDynamicColumnsImpl(type_in_storage, concrete_types, check_ambiguos_paths); +} + +DataTypePtr createConcreteEmptyDynamicColumn(const DataTypePtr & type_in_storage) +{ + if (!type_in_storage->hasDynamicSubcolumns()) + return type_in_storage; + + if (isObject(type_in_storage)) + return std::make_shared<DataTypeTuple>( + DataTypes{std::make_shared<DataTypeUInt8>()}, Names{ColumnObject::COLUMN_NAME_DUMMY}); + + if (const auto * type_array = typeid_cast<const DataTypeArray *>(type_in_storage.get())) + return std::make_shared<DataTypeArray>( + createConcreteEmptyDynamicColumn(type_array->getNestedType())); + + if (const auto * type_map = typeid_cast<const DataTypeMap *>(type_in_storage.get())) + return std::make_shared<DataTypeMap>( + createConcreteEmptyDynamicColumn(type_map->getNestedType())); + + if (const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type_in_storage.get())) + { + const auto & elements = type_tuple->getElements(); + DataTypes new_elements; + new_elements.reserve(elements.size()); + + for (const auto & element : elements) + new_elements.push_back(createConcreteEmptyDynamicColumn(element)); + + return recreateTupleWithElements(*type_tuple, new_elements); + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Type {} unexpectedly has dynamic columns", type_in_storage->getName()); +} + +bool hasDynamicSubcolumns(const ColumnsDescription & columns) +{ + return std::any_of(columns.begin(), columns.end(), + [](const auto & column) + { + return column.type->hasDynamicSubcolumns(); + }); +} + +void extendObjectColumns(NamesAndTypesList & columns_list, const ColumnsDescription & object_columns, bool with_subcolumns) +{ + NamesAndTypesList subcolumns_list; + 
for (auto & column : columns_list) + { + auto object_column = object_columns.tryGetColumn(GetColumnsOptions::All, column.name); + if (object_column) + { + column.type = object_column->type; + + if (with_subcolumns) + subcolumns_list.splice(subcolumns_list.end(), object_columns.getSubcolumns(column.name)); + } + } + + columns_list.splice(columns_list.end(), std::move(subcolumns_list)); +} + +void updateObjectColumns( + ColumnsDescription & object_columns, + const ColumnsDescription & storage_columns, + const NamesAndTypesList & new_columns) +{ + for (const auto & new_column : new_columns) + { + auto object_column = object_columns.tryGetColumn(GetColumnsOptions::All, new_column.name); + if (object_column && !object_column->type->equals(*new_column.type)) + { + auto storage_column = storage_columns.getColumn(GetColumnsOptions::All, new_column.name); + object_columns.modify(new_column.name, [&](auto & column) + { + column.type = getLeastCommonTypeForDynamicColumns(storage_column.type, {object_column->type, new_column.type}); + }); + } + } +} + +namespace +{ + +void flattenTupleImpl( + PathInDataBuilder & builder, + DataTypePtr type, + std::vector<PathInData::Parts> & new_paths, + DataTypes & new_types) +{ + if (const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type.get())) + { + const auto & tuple_names = type_tuple->getElementNames(); + const auto & tuple_types = type_tuple->getElements(); + + for (size_t i = 0; i < tuple_names.size(); ++i) + { + builder.append(tuple_names[i], false); + flattenTupleImpl(builder, tuple_types[i], new_paths, new_types); + builder.popBack(); + } + } + else if (const auto * type_array = typeid_cast<const DataTypeArray *>(type.get())) + { + PathInDataBuilder element_builder; + std::vector<PathInData::Parts> element_paths; + DataTypes element_types; + + flattenTupleImpl(element_builder, type_array->getNestedType(), element_paths, element_types); + assert(element_paths.size() == element_types.size()); + + for (size_t i = 0; i < 
element_paths.size(); ++i) + { + builder.append(element_paths[i], true); + new_paths.emplace_back(builder.getParts()); + new_types.emplace_back(std::make_shared<DataTypeArray>(element_types[i])); + builder.popBack(element_paths[i].size()); + } + } + else + { + new_paths.emplace_back(builder.getParts()); + new_types.emplace_back(type); + } +} + +/// @offsets_columns are used as stack of array offsets and allows to recreate Array columns. +void flattenTupleImpl(const ColumnPtr & column, Columns & new_columns, Columns & offsets_columns) +{ + if (const auto * column_tuple = checkAndGetColumn<ColumnTuple>(column.get())) + { + const auto & subcolumns = column_tuple->getColumns(); + for (const auto & subcolumn : subcolumns) + flattenTupleImpl(subcolumn, new_columns, offsets_columns); + } + else if (const auto * column_array = checkAndGetColumn<ColumnArray>(column.get())) + { + offsets_columns.push_back(column_array->getOffsetsPtr()); + flattenTupleImpl(column_array->getDataPtr(), new_columns, offsets_columns); + offsets_columns.pop_back(); + } + else + { + if (!offsets_columns.empty()) + { + auto new_column = ColumnArray::create(column, offsets_columns.back()); + for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it) + new_column = ColumnArray::create(new_column, *it); + + new_columns.push_back(std::move(new_column)); + } + else + { + new_columns.push_back(column); + } + } +} + +DataTypePtr reduceNumberOfDimensions(DataTypePtr type, size_t dimensions_to_reduce) +{ + while (dimensions_to_reduce--) + { + const auto * type_array = typeid_cast<const DataTypeArray *>(type.get()); + if (!type_array) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Not enough dimensions to reduce"); + + type = type_array->getNestedType(); + } + + return type; +} + +ColumnPtr reduceNumberOfDimensions(ColumnPtr column, size_t dimensions_to_reduce) +{ + while (dimensions_to_reduce--) + { + const auto * column_array = typeid_cast<const ColumnArray *>(column.get()); + if 
(!column_array) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Not enough dimensions to reduce"); + + column = column_array->getDataPtr(); + } + + return column; +} + +/// We save intermediate column, type and number of array +/// dimensions for each intermediate node in path in subcolumns tree. +struct ColumnWithTypeAndDimensions +{ + ColumnPtr column; + DataTypePtr type; + size_t array_dimensions; +}; + +using SubcolumnsTreeWithColumns = SubcolumnsTree<ColumnWithTypeAndDimensions>; +using Node = SubcolumnsTreeWithColumns::Node; + +/// Creates data type and column from tree of subcolumns. +ColumnWithTypeAndDimensions createTypeFromNode(const Node & node) +{ + auto collect_tuple_elemets = [](const auto & children) + { + if (children.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create type from empty Tuple or Nested node"); + + std::vector<std::tuple<String, ColumnWithTypeAndDimensions>> tuple_elements; + tuple_elements.reserve(children.size()); + for (const auto & [name, child] : children) + { + assert(child); + auto column = createTypeFromNode(*child); + tuple_elements.emplace_back(name, std::move(column)); + } + + /// Sort to always create the same type for the same set of subcolumns. + ::sort(tuple_elements.begin(), tuple_elements.end(), + [](const auto & lhs, const auto & rhs) { return std::get<0>(lhs) < std::get<0>(rhs); }); + + auto tuple_names = extractVector<0>(tuple_elements); + auto tuple_columns = extractVector<1>(tuple_elements); + + return std::make_tuple(std::move(tuple_names), std::move(tuple_columns)); + }; + + if (node.kind == Node::SCALAR) + { + return node.data; + } + else if (node.kind == Node::NESTED) + { + auto [tuple_names, tuple_columns] = collect_tuple_elemets(node.children); + + Columns offsets_columns; + offsets_columns.reserve(tuple_columns[0].array_dimensions + 1); + + /// If we have a Nested node and child node with anonymous array levels + /// we need to push a Nested type through all array levels. 
+        /// Example: { "k1": [[{"k2": 1, "k3": 2}]] } should be parsed as
+        /// `k1 Array(Nested(k2 Int, k3 Int))` and k1 is marked as Nested
+        /// and `k2` and `k3` have anonymous_array_level = 1 in that case.
+
+        /// The outermost offsets are the offsets of the array stored in the node itself.
+        const auto & current_array = assert_cast<const ColumnArray &>(*node.data.column);
+        offsets_columns.push_back(current_array.getOffsetsPtr());
+
+        /// Offsets of the remaining array levels are taken from the first child
+        /// (all children are asserted below to have the same number of array dimensions).
+        auto first_column = tuple_columns[0].column;
+        for (size_t i = 0; i < tuple_columns[0].array_dimensions; ++i)
+        {
+            const auto & column_array = assert_cast<const ColumnArray &>(*first_column);
+            offsets_columns.push_back(column_array.getOffsetsPtr());
+            first_column = column_array.getDataPtr();
+        }
+
+        size_t num_elements = tuple_columns.size();
+        Columns tuple_elements_columns(num_elements);
+        DataTypes tuple_elements_types(num_elements);
+
+        /// Reduce extra array dimensions to get columns and types of Nested elements.
+        for (size_t i = 0; i < num_elements; ++i)
+        {
+            assert(tuple_columns[i].array_dimensions == tuple_columns[0].array_dimensions);
+            tuple_elements_columns[i] = reduceNumberOfDimensions(tuple_columns[i].column, tuple_columns[i].array_dimensions);
+            tuple_elements_types[i] = reduceNumberOfDimensions(tuple_columns[i].type, tuple_columns[i].array_dimensions);
+        }
+
+        /// The innermost array uses the offsets pushed last.
+        auto result_column = ColumnArray::create(ColumnTuple::create(tuple_elements_columns), offsets_columns.back());
+        auto result_type = createNested(tuple_elements_types, tuple_names);
+
+        /// Recreate result Array type and Array column.
+        /// Walk the remaining offsets from the inner levels to the outer ones,
+        /// adding one Array level to both the column and the type on each step.
+        for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it)
+        {
+            result_column = ColumnArray::create(result_column, *it);
+            result_type = std::make_shared<DataTypeArray>(result_type);
+        }
+
+        return {result_column, result_type, tuple_columns[0].array_dimensions};
+    }
+    else
+    {
+        /// Neither SCALAR nor NESTED: children become elements of a named Tuple as they are.
+        auto [tuple_names, tuple_columns] = collect_tuple_elemets(node.children);
+
+        size_t num_elements = tuple_columns.size();
+        Columns tuple_elements_columns(num_elements);
+        DataTypes tuple_elements_types(num_elements);
+
+        for (size_t i = 0; i < tuple_columns.size(); ++i)
+        {
+            assert(tuple_columns[i].array_dimensions == tuple_columns[0].array_dimensions);
+            tuple_elements_columns[i] = tuple_columns[i].column;
+            tuple_elements_types[i] = tuple_columns[i].type;
+        }
+
+        auto result_column = ColumnTuple::create(tuple_elements_columns);
+        auto result_type = std::make_shared<DataTypeTuple>(tuple_elements_types, tuple_names);
+
+        return {result_column, result_type, tuple_columns[0].array_dimensions};
+    }
+}
+
+}
+
+/// Collects paths and types of all leaves of @type with the type overload of flattenTupleImpl
+/// and returns them as two parallel collections.
+std::pair<PathsInData, DataTypes> flattenTuple(const DataTypePtr & type)
+{
+    std::vector<PathInData::Parts> new_path_parts;
+    DataTypes new_types;
+    PathInDataBuilder builder;
+
+    flattenTupleImpl(builder, type, new_path_parts, new_types);
+
+    PathsInData new_paths(new_path_parts.begin(), new_path_parts.end());
+    return {new_paths, new_types};
+}
+
+/// Collects all leaf columns of @column (see flattenTupleImpl for columns)
+/// and returns them as elements of one plain Tuple column.
+ColumnPtr flattenTuple(const ColumnPtr & column)
+{
+    Columns new_columns;
+    Columns offsets_columns;
+
+    flattenTupleImpl(column, new_columns, offsets_columns);
+    return ColumnTuple::create(new_columns);
+}
+
+/// Type-only variant: creates a column for every type with createColumn()
+/// just to reuse the overload that works with columns and returns only the resulting type.
+DataTypePtr unflattenTuple(const PathsInData & paths, const DataTypes & tuple_types)
+{
+    assert(paths.size() == tuple_types.size());
+    Columns tuple_columns;
+    tuple_columns.reserve(tuple_types.size());
+    for (const auto & type : tuple_types)
+        tuple_columns.emplace_back(type->createColumn());
+
+    return unflattenTuple(paths, tuple_types, tuple_columns).second;
+}
+
+/// Converts an Object column to a pair of Tuple column and Tuple type.
+std::pair<ColumnPtr, DataTypePtr> 
unflattenObjectToTuple(const ColumnObject & column)
+{
+    const auto & subcolumns = column.getSubcolumns();
+
+    /// Object without subcolumns: return a Tuple with a single UInt8 element
+    /// named ColumnObject::COLUMN_NAME_DUMMY, resized to the size of the Object column.
+    if (subcolumns.empty())
+    {
+        auto type = std::make_shared<DataTypeTuple>(
+            DataTypes{std::make_shared<DataTypeUInt8>()},
+            Names{ColumnObject::COLUMN_NAME_DUMMY});
+
+        return {type->createColumn()->cloneResized(column.size()), type};
+    }
+
+    PathsInData paths;
+    DataTypes types;
+    Columns columns;
+
+    paths.reserve(subcolumns.size());
+    types.reserve(subcolumns.size());
+    columns.reserve(subcolumns.size());
+
+    /// Every subcolumn contributes its path, its least common type and its finalized column.
+    for (const auto & entry : subcolumns)
+    {
+        paths.emplace_back(entry->path);
+        types.emplace_back(entry->data.getLeastCommonType());
+        columns.emplace_back(entry->data.getFinalizedColumnPtr());
+    }
+
+    return unflattenTuple(paths, types, columns);
+}
+
+/// Builds nested column and type from flat lists of paths, types and columns.
+/// The three collections are parallel: element i of each describes the same subcolumn.
+/// Throws LOGICAL_ERROR if @paths is empty.
+std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
+    const PathsInData & paths,
+    const DataTypes & tuple_types,
+    const Columns & tuple_columns)
+{
+    assert(paths.size() == tuple_types.size());
+    assert(paths.size() == tuple_columns.size());
+
+    if (paths.empty())
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot unflatten empty Tuple");
+
+    /// We add all paths to the subcolumn tree and then create a type from it.
+    /// The tree stores column, type and number of array dimensions
+    /// for each intermediate node.
+    SubcolumnsTreeWithColumns tree;
+
+    for (size_t i = 0; i < paths.size(); ++i)
+    {
+        /// Local copies: they are narrowed down by the callback below as it goes deeper.
+        auto column = tuple_columns[i];
+        auto type = tuple_types[i];
+
+        const auto & parts = paths[i].getParts();
+        size_t num_parts = parts.size();
+
+        /// @pos counts invocations of the callback and is used as index of the current part
+        /// (presumably tree.add calls it once per part of the path, in order -- TODO confirm).
+        size_t pos = 0;
+        tree.add(paths[i], [&](Node::Kind kind, bool exists) -> std::shared_ptr<Node>
+        {
+            if (pos >= num_parts)
+                throw Exception(ErrorCodes::LOGICAL_ERROR,
+                    "Not enough name parts for path {}. Expected at least {}, got {}",
+                    paths[i].getPath(), pos + 1, num_parts);
+
+            size_t array_dimensions = kind == Node::NESTED ? 
1 : parts[pos].anonymous_array_level;
+            ColumnWithTypeAndDimensions current_column{column, type, array_dimensions};
+
+            /// Get type and column for next node.
+            if (array_dimensions)
+            {
+                type = reduceNumberOfDimensions(type, array_dimensions);
+                column = reduceNumberOfDimensions(column, array_dimensions);
+            }
+
+            /// Type and column are advanced above even for an existing node;
+            /// only creation of the node itself is skipped.
+            ++pos;
+            if (exists)
+                return nullptr;
+
+            /// Only SCALAR nodes are constructed with the path.
+            return kind == Node::SCALAR
+                ? std::make_shared<Node>(kind, current_column, paths[i])
+                : std::make_shared<Node>(kind, current_column);
+        });
+    }
+
+    auto [column, type, _] = createTypeFromNode(tree.getRoot());
+    return std::make_pair(std::move(column), std::move(type));
+}
+
+/// Appends expression `materialize(CAST(<default value of data_type>, '<name of data_type>'))`
+/// with alias @column_name to the WITH clause of SELECT @query.
+/// The WITH clause is created if the query does not have one.
+static void addConstantToWithClause(const ASTPtr & query, const String & column_name, const DataTypePtr & data_type)
+{
+    auto & select = query->as<ASTSelectQuery &>();
+    if (!select.with())
+        select.setExpression(ASTSelectQuery::Expression::WITH, std::make_shared<ASTExpressionList>());
+
+    /// TODO: avoid materialize
+    auto node = makeASTFunction("materialize",
+        makeASTFunction("CAST",
+            std::make_shared<ASTLiteral>(data_type->getDefault()),
+            std::make_shared<ASTLiteral>(data_type->getName())));
+
+    node->alias = column_name;
+    node->prefer_alias_to_column_name = true;
+    select.with()->children.push_back(std::move(node));
+}
+
+/// @expected_columns and @available_columns contain descriptions
+/// of extended Object columns.
+void replaceMissedSubcolumnsByConstants(
+    const ColumnsDescription & expected_columns,
+    const ColumnsDescription & available_columns,
+    ASTPtr query)
+{
+    NamesAndTypes missed_names_types;
+
+    /// Find all subcolumns that are in @expected_columns, but not in @available_columns.
+    /// Only columns listed in @available_columns are inspected:
+    /// each of them is looked up by name in @expected_columns.
+    for (const auto & column : available_columns)
+    {
+        auto expected_column = expected_columns.getColumn(GetColumnsOptions::All, column.name);
+
+        /// Extract all paths from both descriptions to easily check existence of subcolumns.
+        auto [available_paths, available_types] = flattenTuple(column.type);
+        auto [expected_paths, expected_types] = flattenTuple(expected_column.type);
+
+        /// Builds a sorted list of (full subcolumn name, type) pairs,
+        /// where the full name is the column name concatenated with the path of the subcolumn.
+        auto extract_names_and_types = [&column](const auto & paths, const auto & types)
+        {
+            NamesAndTypes res;
+            res.reserve(paths.size());
+            for (size_t i = 0; i < paths.size(); ++i)
+            {
+                auto full_name = Nested::concatenateName(column.name, paths[i].getPath());
+                res.emplace_back(full_name, types[i]);
+            }
+
+            ::sort(res.begin(), res.end());
+            return res;
+        };
+
+        auto available_names_types = extract_names_and_types(available_paths, available_types);
+        auto expected_names_types = extract_names_and_types(expected_paths, expected_types);
+
+        /// Collect expected subcolumns that have no available counterpart.
+        /// Entries are matched by name only, types are not compared.
+        std::set_difference(
+            expected_names_types.begin(), expected_names_types.end(),
+            available_names_types.begin(), available_names_types.end(),
+            std::back_inserter(missed_names_types),
+            [](const auto & lhs, const auto & rhs) { return lhs.name < rhs.name; });
+    }
+
+    if (missed_names_types.empty())
+        return;
+
+    IdentifierNameSet identifiers;
+    query->collectIdentifierNames(identifiers);
+
+    /// Replace missed subcolumns with default literals of their types.
+    /// Only subcolumns that are actually mentioned in the query are added.
+    for (const auto & [name, type] : missed_names_types)
+        if (identifiers.contains(name))
+            addConstantToWithClause(query, name, type);
+}
+
+/// Keeps the structure of the array for @num_dimensions_to_keep levels
+/// and replaces everything found at that depth with @replacement.
+Field FieldVisitorReplaceScalars::operator()(const Array & x) const
+{
+    if (num_dimensions_to_keep == 0)
+        return replacement;
+
+    const size_t size = x.size();
+    Array res(size);
+    for (size_t i = 0; i < size; ++i)
+        res[i] = applyVisitor(FieldVisitorReplaceScalars(replacement, num_dimensions_to_keep - 1), x[i]);
+    return res;
+}
+
+/// Returns 1 plus the maximum number of dimensions among the elements (1 for an empty array).
+/// Sets need_fold_dimension if an element other than the first one has a number of dimensions
+/// that differs from the maximum seen before it.
+size_t FieldVisitorToNumberOfDimensions::operator()(const Array & x)
+{
+    const size_t size = x.size();
+    size_t dimensions = 0;
+
+    for (size_t i = 0; i < size; ++i)
+    {
+        size_t element_dimensions = applyVisitor(*this, x[i]);
+        if (i > 0 && element_dimensions != dimensions)
+            need_fold_dimension = true;
+
+        dimensions = std::max(dimensions, element_dimensions);
+    }
+
+    return 1 + dimensions;
+}
+
+/// Descends @num_dimensions_to_fold levels into the array applying the visitor to every element;
+/// when the counter reaches zero the array is returned unchanged.
+Field FieldVisitorFoldDimension::operator()(const Array & x) const
+{
+    if (num_dimensions_to_fold == 0)
+        return x;
+
+    const size_t size = x.size();
+    Array res(size);
+    for (size_t i = 0; i < size; ++i)
+        res[i] = applyVisitor(FieldVisitorFoldDimension(num_dimensions_to_fold - 1), x[i]);
+
+    return res;
+}
+
+/// Replaces the type of every column that has dynamic subcolumns
+/// with the result of createConcreteEmptyDynamicColumn for that type.
+void setAllObjectsToDummyTupleType(NamesAndTypesList & columns)
+{
+    for (auto & column : columns)
+        if (column.type->hasDynamicSubcolumns())
+            column.type = createConcreteEmptyDynamicColumn(column.type);
+}
+
+}
diff --git a/contrib/clickhouse/src/DataTypes/ObjectUtils.h b/contrib/clickhouse/src/DataTypes/ObjectUtils.h
new file mode 100644
index 00000000000..2bfcaae09ca
--- /dev/null
+++ b/contrib/clickhouse/src/DataTypes/ObjectUtils.h
@@ -0,0 +1,212 @@
+#pragma once
+
+#include <Core/Block.h>
+#include <Core/NamesAndTypes.h>
+#include <Common/FieldVisitors.h>
+#include <Storages/ColumnsDescription.h>
+#include <DataTypes/DataTypeTuple.h>
+#include <DataTypes/DataTypesNumber.h>
+#include <Columns/ColumnObject.h>
+
+namespace DB
+{
+
+struct StorageSnapshot;
+using StorageSnapshotPtr = 
std::shared_ptr<StorageSnapshot>;
+
+/// Returns number of dimensions in Array type. 0 if type is not array.
+size_t getNumberOfDimensions(const IDataType & type);
+
+/// Returns number of dimensions in Array column. 0 if column is not array.
+size_t getNumberOfDimensions(const IColumn & column);
+
+/// Returns type of scalars of Array of arbitrary dimensions.
+DataTypePtr getBaseTypeOfArray(const DataTypePtr & type);
+
+/// Returns Array type with requested scalar type and number of dimensions.
+DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions);
+
+/// Returns column of scalars of Array of arbitrary dimensions.
+ColumnPtr getBaseColumnOfArray(const ColumnPtr & column);
+
+/// Returns empty Array column with requested scalar column and number of dimensions.
+ColumnPtr createArrayOfColumn(const ColumnPtr & column, size_t num_dimensions);
+
+/// Returns Array with requested number of dimensions and no scalars.
+Array createEmptyArrayField(size_t num_dimensions);
+
+/// Tries to get data type by column. Only a limited subset of types is supported.
+DataTypePtr getDataTypeByColumn(const IColumn & column);
+
+/// Converts Object types and columns to Tuples in @block
+/// and checks that types are consistent with types in @storage_snapshot.
+void convertDynamicColumnsToTuples(Block & block, const StorageSnapshotPtr & storage_snapshot);
+
+/// Checks that each path is not the prefix of any other path.
+void checkObjectHasNoAmbiguosPaths(const PathsInData & paths);
+
+/// Receives several Tuple types and deduces the least common type among them.
+DataTypePtr getLeastCommonTypeForDynamicColumns(
+    const DataTypePtr & type_in_storage, const DataTypes & types, bool check_ambiguos_paths = false);
+
+/// NOTE(review): undocumented in the original. In this header its result is used as
+/// a placeholder type for Object columns -- confirm exact semantics against the definition.
+DataTypePtr createConcreteEmptyDynamicColumn(const DataTypePtr & type_in_storage);
+
+/// Converts types of object columns to tuples in @columns_list
+/// according to @object_columns and adds all tuple's subcolumns if needed.
+void extendObjectColumns(NamesAndTypesList & columns_list, const ColumnsDescription & object_columns, bool with_subcolumns);
+
+/// Checks whether @columns contain any column with dynamic subcolumns.
+bool hasDynamicSubcolumns(const ColumnsDescription & columns);
+
+/// Updates types of objects in @object_columns in place
+/// according to types in @new_columns.
+void updateObjectColumns(
+    ColumnsDescription & object_columns,
+    const ColumnsDescription & storage_columns,
+    const NamesAndTypesList & new_columns);
+
+using DataTypeTuplePtr = std::shared_ptr<DataTypeTuple>;
+
+/// Flattens nested Tuple to plain Tuple, i.e. extracts all paths and types from tuple.
+/// E.g. Tuple(t Tuple(c1 UInt32, c2 String), c3 UInt64) -> Tuple(t.c1 UInt32, t.c2 String, c3 UInt64)
+/// The result is returned as two parallel collections: paths and types of the leaves.
+std::pair<PathsInData, DataTypes> flattenTuple(const DataTypePtr & type);
+
+/// Flattens nested Tuple column to plain Tuple column.
+ColumnPtr flattenTuple(const ColumnPtr & column);
+
+/// The reverse operation to 'flattenTuple'.
+/// Creates nested Tuple from all paths and types.
+/// E.g. Tuple(t.c1 UInt32, t.c2 String, c3 UInt64) -> Tuple(t Tuple(c1 UInt32, c2 String), c3 UInt64)
+DataTypePtr unflattenTuple(
+    const PathsInData & paths,
+    const DataTypes & tuple_types);
+
+/// Converts an Object column to a pair of Tuple column and Tuple type.
+std::pair<ColumnPtr, DataTypePtr> unflattenObjectToTuple(const ColumnObject & column);
+
+/// Same as 'unflattenTuple' above, but also builds the nested column from @tuple_columns.
+std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
+    const PathsInData & paths,
+    const DataTypes & tuple_types,
+    const Columns & tuple_columns);
+
+
+/// For all columns which exist in @expected_columns and
+/// don't exist in @available_columns adds to WITH clause
+/// an alias with column name to literal of default value of column type.
+void replaceMissedSubcolumnsByConstants(
+    const ColumnsDescription & expected_columns,
+    const ColumnsDescription & available_columns,
+    ASTPtr query);
+
+/// Visitor that keeps @num_dimensions_to_keep dimensions in arrays
+/// and replaces all scalars or nested arrays with @replacement at that level.
+class FieldVisitorReplaceScalars : public StaticVisitor<Field>
+{
+public:
+    /// @replacement_ is stored by reference, so it must outlive the visitor.
+    FieldVisitorReplaceScalars(const Field & replacement_, size_t num_dimensions_to_keep_)
+        : replacement(replacement_), num_dimensions_to_keep(num_dimensions_to_keep_)
+    {
+    }
+
+    Field operator()(const Array & x) const;
+
+    /// Any non-array field is replaced regardless of the current depth.
+    template <typename T>
+    Field operator()(const T &) const { return replacement; }
+
+private:
+    const Field & replacement;
+    size_t num_dimensions_to_keep;
+};
+
+/// Calculates number of dimensions in array field.
+/// Returns 0 for scalar fields.
+class FieldVisitorToNumberOfDimensions : public StaticVisitor<size_t>
+{
+public:
+    size_t operator()(const Array & x);
+
+    template <typename T>
+    size_t operator()(const T &) const { return 0; }
+
+    /// Written by the Array overload; never reset by the visitor itself.
+    bool need_fold_dimension = false;
+};
+
+/// Fold field (except Null) to the higher dimension, e.g. `1` -- fold 2 --> `[[1]]`
+/// used to normalize dimension of element in an array. e.g [1, [2]] --> [[1], [2]]
+class FieldVisitorFoldDimension : public StaticVisitor<Field>
+{
+public:
+    explicit FieldVisitorFoldDimension(size_t num_dimensions_to_fold_) : num_dimensions_to_fold(num_dimensions_to_fold_) { }
+
+    Field operator()(const Array & x) const;
+
+    /// Null is never wrapped into arrays.
+    Field operator()(const Null & x) const { return x; }
+
+    /// Wraps a scalar into @num_dimensions_to_fold nested single-element arrays.
+    template <typename T>
+    Field operator()(const T & x) const
+    {
+        if (num_dimensions_to_fold == 0)
+            return x;
+
+        /// The first level is created here, the loop below adds the remaining ones.
+        Array res(1, x);
+        for (size_t i = 1; i < num_dimensions_to_fold; ++i)
+        {
+            Array new_res;
+            new_res.push_back(std::move(res));
+            res = std::move(new_res);
+        }
+
+        return res;
+    }
+
+private:
+    size_t num_dimensions_to_fold;
+};
+
+void setAllObjectsToDummyTupleType(NamesAndTypesList & columns);
+
+/// Receives range of objects, which contains collections
+/// of columns-like objects (e.g. ColumnsDescription or NamesAndTypesList)
+/// and deduces the common types of object columns for all entries.
+/// @entry_columns_getter should extract reference to collection of
+/// columns-like objects from entry to which Iterator points.
+/// columns-like object should have fields "name" and "type".
+/// The result contains one column for every name collected below, typed with
+/// the least common type of all types collected for that name.
+template <typename Iterator, typename EntryColumnsGetter>
+ColumnsDescription getConcreteObjectColumns(
+    Iterator begin, Iterator end,
+    const ColumnsDescription & storage_columns,
+    EntryColumnsGetter && entry_columns_getter)
+{
+    std::unordered_map<String, DataTypes> types_in_entries;
+
+    /// Add dummy column for all Object columns
+    /// to not lose any column if it's missing
+    /// in all entries. If it exists in any entry
+    /// dummy column will be removed.
+    for (const auto & column : storage_columns)
+    {
+        if (column.type->hasDynamicSubcolumns())
+            types_in_entries[column.name].push_back(createConcreteEmptyDynamicColumn(column.type));
+    }
+
+    /// Collect types from entries, but only for columns that are physical columns
+    /// of the storage and have dynamic subcolumns there.
+    for (auto it = begin; it != end; ++it)
+    {
+        const auto & entry_columns = entry_columns_getter(*it);
+        for (const auto & column : entry_columns)
+        {
+            auto storage_column = storage_columns.tryGetPhysical(column.name);
+            if (storage_column && storage_column->type->hasDynamicSubcolumns())
+                types_in_entries[column.name].push_back(column.type);
+        }
+    }
+
+    /// NOTE(review): the map is unordered, so columns are added to the result
+    /// in unspecified order -- confirm that callers do not rely on the order.
+    ColumnsDescription res;
+    for (const auto & [name, types] : types_in_entries)
+    {
+        auto storage_column = storage_columns.getPhysical(name);
+        res.add({name, getLeastCommonTypeForDynamicColumns(storage_column.type, types)});
+    }
+
+    return res;
+}
+
+}
diff --git a/contrib/clickhouse/src/DataTypes/Serializations/ISerialization.cpp b/contrib/clickhouse/src/DataTypes/Serializations/ISerialization.cpp
new file mode 100644
index 00000000000..782b890841a
--- /dev/null
+++ b/contrib/clickhouse/src/DataTypes/Serializations/ISerialization.cpp
@@ -0,0 +1,323 @@
+#include <DataTypes/Serializations/ISerialization.h>
+#include <Compression/CompressionFactory.h>
+#include <Columns/IColumn.h>
+#include <IO/WriteHelpers.h>
+#include <IO/Operators.h>
+#include <IO/ReadBufferFromString.h>
+#include <Common/escapeForFileName.h>
+#include <DataTypes/NestedUtils.h>
+#include <base/EnumReflection.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int MULTIPLE_STREAMS_REQUIRED;
+    extern const int UNEXPECTED_DATA_AFTER_PARSED_VALUE;
+    extern const int LOGICAL_ERROR;
+}
+
+/// Returns Kind::SPARSE for a sparse column and Kind::DEFAULT for any other column.
+ISerialization::Kind ISerialization::getKind(const IColumn & column)
+{
+    if (column.isSparse())
+        return Kind::SPARSE;
+
+    return Kind::DEFAULT;
+}
+
+/// Returns the textual name of @kind; the inverse of stringToKind.
+String ISerialization::kindToString(Kind kind)
+{
+    switch (kind)
+    {
+        case Kind::DEFAULT:
+            return "Default";
+        case Kind::SPARSE:
+            return "Sparse";
+    }
+    UNREACHABLE();
+}
+
+/// Parses the name produced by kindToString. The comparison is case-sensitive.
+/// Throws LOGICAL_ERROR for any other string.
+ISerialization::Kind ISerialization::stringToKind(const String & str)
+{
+    if (str == "Default")
+        return Kind::DEFAULT;
+    else if (str == "Sparse")
+        return Kind::SPARSE;
+    else
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown serialization kind '{}'", str);
+}
+
+/// Returns a human-readable description of the substream: the name of its type and,
+/// for TupleElement, also the name of the element and the escape flag.
+String ISerialization::Substream::toString() const
+{
+    if (type == TupleElement)
+        return fmt::format("TupleElement({}, escape_tuple_delimiter = {})",
+            tuple_element_name, escape_tuple_delimiter ? 
"true" : "false");
+
+    return String(magic_enum::enum_name(type));
+}
+
+/// Returns descriptions of all substreams of the path as "{a, b, ...}".
+String ISerialization::SubstreamPath::toString() const
+{
+    WriteBufferFromOwnString wb;
+    wb << "{";
+    for (size_t i = 0; i < size(); ++i)
+    {
+        if (i != 0)
+            wb << ", ";
+        wb << at(i).toString();
+    }
+    wb << "}";
+    return wb.str();
+}
+
+/// Default implementation for serializations with a single stream:
+/// temporarily appends a Regular substream carrying @data to the path,
+/// invokes @callback once and restores the path.
+void ISerialization::enumerateStreams(
+    EnumerateStreamsSettings & settings,
+    const StreamCallback & callback,
+    const SubstreamData & data) const
+{
+    settings.path.push_back(Substream::Regular);
+    settings.path.back().data = data;
+    callback(settings.path);
+    settings.path.pop_back();
+}
+
+/// Convenience overload: enumerates streams with default settings and
+/// substream data made of this serialization, @type and @column.
+void ISerialization::enumerateStreams(
+    const StreamCallback & callback,
+    const DataTypePtr & type,
+    const ColumnPtr & column) const
+{
+    EnumerateStreamsSettings settings;
+    auto data = SubstreamData(getPtr()).withType(type).withColumn(column);
+    enumerateStreams(settings, callback, data);
+}
+
+/// Default implementation always throws MULTIPLE_STREAMS_REQUIRED.
+void ISerialization::serializeBinaryBulk(const IColumn & column, WriteBuffer &, size_t, size_t) const
+{
+    throw Exception(ErrorCodes::MULTIPLE_STREAMS_REQUIRED, "Column {} must be serialized with multiple streams", column.getName());
+}
+
+/// Default implementation always throws MULTIPLE_STREAMS_REQUIRED.
+void ISerialization::deserializeBinaryBulk(IColumn & column, ReadBuffer &, size_t, double) const
+{
+    throw Exception(ErrorCodes::MULTIPLE_STREAMS_REQUIRED, "Column {} must be deserialized with multiple streams", column.getName());
+}
+
+/// Default implementation: writes the column with serializeBinaryBulk into the stream
+/// returned by the getter for the current path. Does nothing if the getter returns nullptr.
+void ISerialization::serializeBinaryBulkWithMultipleStreams(
+    const IColumn & column,
+    size_t offset,
+    size_t limit,
+    SerializeBinaryBulkSettings & settings,
+    SerializeBinaryBulkStatePtr & /* state */) const
+{
+    if (WriteBuffer * stream = settings.getter(settings.path))
+        serializeBinaryBulk(column, *stream, offset, limit);
+}
+
+/// Default implementation: takes the column from @cache if it is there for the current path,
+/// otherwise reads it with deserializeBinaryBulk from the stream returned by the getter
+/// and puts the result into the cache. @column is left untouched if there is neither.
+void ISerialization::deserializeBinaryBulkWithMultipleStreams(
+    ColumnPtr & column,
+    size_t limit,
+    DeserializeBinaryBulkSettings & settings,
+    DeserializeBinaryBulkStatePtr & /* state */,
+    SubstreamsCache * cache) const
+{
+    auto cached_column = 
getFromSubstreamsCache(cache, settings.path);
+    if (cached_column)
+    {
+        column = cached_column;
+    }
+    else if (ReadBuffer * stream = settings.getter(settings.path))
+    {
+        auto mutable_column = column->assumeMutable();
+        deserializeBinaryBulk(*mutable_column, *stream, limit, settings.avg_value_size_hint);
+        column = std::move(mutable_column);
+        addToSubstreamsCache(cache, settings.path, column);
+    }
+}
+
+namespace
+{
+
+using SubstreamIterator = ISerialization::SubstreamPath::const_iterator;
+
+/// Appends to @stream_name a suffix for every substream in [begin, end) and returns the result.
+/// Substream types that are not listed below add nothing to the name.
+/// @escape_tuple_delimiter allows escaping of tuple element names; an element is escaped
+/// only if its own escape_tuple_delimiter flag is set as well.
+String getNameForSubstreamPath(
+    String stream_name,
+    SubstreamIterator begin,
+    SubstreamIterator end,
+    bool escape_tuple_delimiter)
+{
+    using Substream = ISerialization::Substream;
+
+    /// Number of ArrayElements substreams passed so far; used as the suffix of ".size".
+    size_t array_level = 0;
+    for (auto it = begin; it != end; ++it)
+    {
+        if (it->type == Substream::NullMap)
+            stream_name += ".null";
+        else if (it->type == Substream::ArraySizes)
+            stream_name += ".size" + toString(array_level);
+        else if (it->type == Substream::ArrayElements)
+            ++array_level;
+        else if (it->type == Substream::DictionaryKeys)
+            stream_name += ".dict";
+        else if (it->type == Substream::SparseOffsets)
+            stream_name += ".sparse.idx";
+        else if (it->type == Substream::TupleElement)
+        {
+            /// For compatibility reasons, we use %2E (escaped dot) instead of dot.
+            /// Because nested data may be represented not by Array of Tuple,
+            /// but by separate Array columns with names in a form of a.b,
+            /// and name is encoded as a whole.
+            if (escape_tuple_delimiter && it->escape_tuple_delimiter)
+                stream_name += escapeForFileName("." + it->tuple_element_name);
+            else
+                stream_name += "." 
+ it->tuple_element_name;
+        }
+    }
+
+    return stream_name;
+}
+
+}
+
+/// Same as the overload below, using the name of @column in storage.
+String ISerialization::getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path)
+{
+    return getFileNameForStream(column.getNameInStorage(), path);
+}
+
+/// Returns true if @path is not empty, contains no ArrayElements substream
+/// and ends with ArraySizes.
+bool isOffsetsOfNested(const ISerialization::SubstreamPath & path)
+{
+    if (path.empty())
+        return false;
+
+    for (const auto & elem : path)
+        if (elem.type == ISerialization::Substream::ArrayElements)
+            return false;
+
+    return path.back().type == ISerialization::Substream::ArraySizes;
+}
+
+/// Returns the name of the stream for @path: the escaped name of the column
+/// followed by suffixes of the substreams.
+String ISerialization::getFileNameForStream(const String & name_in_storage, const SubstreamPath & path)
+{
+    String stream_name;
+    auto nested_storage_name = Nested::extractTableName(name_in_storage);
+    /// For offsets of a column whose name has a Nested table part, the table name is used
+    /// instead of the full name (presumably so that such columns share one stream of sizes).
+    if (name_in_storage != nested_storage_name && isOffsetsOfNested(path))
+        stream_name = escapeForFileName(nested_storage_name);
+    else
+        stream_name = escapeForFileName(name_in_storage);
+
+    return getNameForSubstreamPath(std::move(stream_name), path.begin(), path.end(), true);
+}
+
+/// Returns the name of the subcolumn for the whole @path.
+String ISerialization::getSubcolumnNameForStream(const SubstreamPath & path)
+{
+    return getSubcolumnNameForStream(path, path.size());
+}
+
+/// Returns the name of the subcolumn for the first @prefix_len substreams of @path,
+/// without escaping and without the leading dot. The result may be empty.
+/// @prefix_len is not checked here and must not exceed path.size().
+String ISerialization::getSubcolumnNameForStream(const SubstreamPath & path, size_t prefix_len)
+{
+    auto subcolumn_name = getNameForSubstreamPath("", path.begin(), path.begin() + prefix_len, false);
+    if (!subcolumn_name.empty())
+        subcolumn_name = subcolumn_name.substr(1); // It starts with a dot.
+
+    return subcolumn_name;
+}
+
+/// Stores @column in @cache under the subcolumn name of @path.
+/// Does nothing if there is no cache or the path is empty.
+/// An entry that already exists for that name is kept (emplace does not overwrite).
+void ISerialization::addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column)
+{
+    if (!cache || path.empty())
+        return;
+
+    cache->emplace(getSubcolumnNameForStream(path), column);
+}
+
+/// Returns the column cached under the subcolumn name of @path,
+/// or nullptr if there is no cache, the path is empty or nothing is cached for it.
+ColumnPtr ISerialization::getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path)
+{
+    if (!cache || path.empty())
+        return nullptr;
+
+    auto it = cache->find(getSubcolumnNameForStream(path));
+    return it == cache->end() ? 
nullptr : it->second;
+}
+
+/// Returns false if the path contains a NullMap, ArraySizes, DictionaryIndexes
+/// or SparseOffsets substream, and true otherwise (including for an empty path).
+bool ISerialization::isSpecialCompressionAllowed(const SubstreamPath & path)
+{
+    for (const auto & elem : path)
+    {
+        if (elem.type == Substream::NullMap
+            || elem.type == Substream::ArraySizes
+            || elem.type == Substream::DictionaryIndexes
+            || elem.type == Substream::SparseOffsets)
+            return false;
+    }
+    return true;
+}
+
+/// Default implementation: reads one field into a temporary string
+/// and parses that string with deserializeWholeText.
+void ISerialization::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    String field;
+    /// Read until \t or \n.
+    readString(field, istr);
+    ReadBufferFromString buf(field);
+    deserializeWholeText(column, buf, settings);
+}
+
+/// Default implementation: the same as serializeText.
+void ISerialization::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+    serializeText(column, row_num, ostr, settings);
+}
+
+/// Returns the number of ArrayElements substreams in @path.
+size_t ISerialization::getArrayLevel(const SubstreamPath & path)
+{
+    size_t level = 0;
+    for (const auto & elem : path)
+        level += elem.type == Substream::ArrayElements;
+    return level;
+}
+
+/// Returns true if the last substream of the prefix of length @prefix_len
+/// is NullMap, TupleElement or ArraySizes.
+/// Returns false for an empty prefix and for a prefix longer than the path.
+bool ISerialization::hasSubcolumnForPath(const SubstreamPath & path, size_t prefix_len)
+{
+    if (prefix_len == 0 || prefix_len > path.size())
+        return false;
+
+    size_t last_elem = prefix_len - 1;
+    return path[last_elem].type == Substream::NullMap
+        || path[last_elem].type == Substream::TupleElement
+        || path[last_elem].type == Substream::ArraySizes;
+}
+
+/// Takes the data of the last substream of the prefix of length @prefix_len and applies to it
+/// the creators of the preceding substreams, from the nearest one to the first one.
+/// Returns empty data for an empty prefix.
+ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len)
+{
+    assert(prefix_len <= path.size());
+    if (prefix_len == 0)
+        return {};
+
+    /// Signed indices, so that the descending loop below can stop after index 0.
+    ssize_t last_elem = prefix_len - 1;
+    auto res = path[last_elem].data;
+    for (ssize_t i = last_elem - 1; i >= 0; --i)
+    {
+        const auto & creator = path[i].creator;
+        if (creator)
+        {
+            /// Parts of the data that are not set are left as they are.
+            res.type = res.type ? creator->create(res.type) : res.type;
+            res.serialization = res.serialization ? creator->create(res.serialization) : res.serialization;
+            res.column = res.column ? 
creator->create(res.column) : res.column;
+        }
+    }
+
+    return res;
+}
+
+/// Always throws UNEXPECTED_DATA_AFTER_PARSED_VALUE. The message contains up to 10 bytes
+/// of the data remaining in @istr and the text representation of the last value of @column.
+/// NOTE(review): `column.size() - 1` wraps around for an empty column; presumably this is
+/// only called right after a value has been appended to the column -- confirm against callers.
+void ISerialization::throwUnexpectedDataAfterParsedValue(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const String & type_name) const
+{
+    WriteBufferFromOwnString ostr;
+    serializeText(column, column.size() - 1, ostr, settings);
+    throw Exception(
+        ErrorCodes::UNEXPECTED_DATA_AFTER_PARSED_VALUE,
+        "Unexpected data '{}' after parsed {} value '{}'",
+        std::string(istr.position(), std::min(size_t(10), istr.available())),
+        type_name,
+        ostr.str());
+}
+
+}
+
diff --git a/contrib/clickhouse/src/DataTypes/Serializations/ISerialization.h b/contrib/clickhouse/src/DataTypes/Serializations/ISerialization.h
new file mode 100644
index 00000000000..17e6dfb85bc
--- /dev/null
+++ b/contrib/clickhouse/src/DataTypes/Serializations/ISerialization.h
@@ -0,0 +1,417 @@
+#pragma once
+
+#include <Common/COW.h>
+#include <Core/Types_fwd.h>
+#include <base/demangle.h>
+#include <Common/typeid_cast.h>
+#include <Columns/IColumn.h>
+
+#include <boost/noncopyable.hpp>
+#include <unordered_map>
+#include <memory>
+#include <variant>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+class IDataType;
+
+class ReadBuffer;
+class WriteBuffer;
+class ProtobufReader;
+class ProtobufWriter;
+
+/// NOTE(review): IDataType is already forward-declared a few lines above; the repeat is redundant.
+class IDataType;
+using DataTypePtr = std::shared_ptr<const IDataType>;
+
+class ISerialization;
+using SerializationPtr = std::shared_ptr<const ISerialization>;
+
+class SerializationInfo;
+using SerializationInfoPtr = std::shared_ptr<const SerializationInfo>;
+
+class Field;
+
+struct FormatSettings;
+struct NameAndTypePair;
+
+/** Represents serialization of data type.
+  * Has methods to serialize/deserialize column in binary and several text formats.
+  * Every data type has default serialization, but can be serialized in different representations.
+  * Default serialization can be wrapped to one of the special kind of serializations.
 * Currently there is only one special serialization: Sparse.
 * Each serialization has its own implementation of IColumn as its in-memory representation.
 */
class ISerialization : private boost::noncopyable, public std::enable_shared_from_this<ISerialization>
{
public:
    ISerialization() = default;
    virtual ~ISerialization() = default;

    /// Kind of serialization, stored as UInt8.
    enum class Kind : UInt8
    {
        DEFAULT = 0,
        SPARSE = 1,
    };

    /// Kind of this serialization. The base implementation reports Kind::DEFAULT.
    virtual Kind getKind() const { return Kind::DEFAULT; }

    /// Shared pointer to this object, obtained through enable_shared_from_this.
    SerializationPtr getPtr() const { return shared_from_this(); }

    static Kind getKind(const IColumn & column);
    static String kindToString(Kind kind);
    static Kind stringToKind(const String & str);

    /** Binary serialization for range of values in column - for writing to disk/network, etc.
      *
      * Some data types are represented in multiple streams while being serialized.
      * Example:
      * - Arrays are represented as stream of all elements and stream of array sizes.
      * - Nullable types are represented as stream of values (with unspecified values in place of NULLs) and stream of NULL flags.
      *
      * Different streams are identified by "path".
      * If the data type requires a single stream (it's true for most of data types), the stream will have empty path.
      * Otherwise, the path can have components like "array elements", "array sizes", etc.
      *
      * For multidimensional arrays, path can have arbitrary length.
      * As an example, for 2-dimensional arrays of numbers we have at least three streams:
      * - array sizes;                      (sizes of top level arrays)
      * - array elements / array sizes;     (sizes of second level (nested) arrays)
      * - array elements / array elements;  (the most deep elements, placed contiguously)
      *
      * Descendants must override either serializeBinaryBulk, deserializeBinaryBulk methods (for simple cases with single stream)
      *  or serializeBinaryBulkWithMultipleStreams, deserializeBinaryBulkWithMultipleStreams, enumerateStreams methods (for cases with multiple streams).
      *
      * Default implementations of ...WithMultipleStreams methods will call serializeBinaryBulk, deserializeBinaryBulk for single stream.
      */

    /// Interface with three `create` overloads: each takes the previous
    /// type / serialization / column and returns a new one of the same category.
    struct ISubcolumnCreator
    {
        virtual DataTypePtr create(const DataTypePtr & prev) const = 0;
        virtual SerializationPtr create(const SerializationPtr & prev) const = 0;
        virtual ColumnPtr create(const ColumnPtr & prev) const = 0;
        virtual ~ISubcolumnCreator() = default;
    };

    using SubcolumnCreatorPtr = std::shared_ptr<const ISubcolumnCreator>;

    /// Bundle of serialization, type, column and serialization info for one substream.
    /// The with*() setters store their argument and return *this, so calls can be chained.
    struct SubstreamData
    {
        SubstreamData() = default;
        SubstreamData(SerializationPtr serialization_)
            : serialization(std::move(serialization_))
        {
        }

        SubstreamData & withType(DataTypePtr type_)
        {
            type = std::move(type_);
            return *this;
        }

        SubstreamData & withColumn(ColumnPtr column_)
        {
            column = std::move(column_);
            return *this;
        }

        SubstreamData & withSerializationInfo(SerializationInfoPtr serialization_info_)
        {
            serialization_info = std::move(serialization_info_);
            return *this;
        }

        SerializationPtr serialization;
        DataTypePtr type;
        ColumnPtr column;
        SerializationInfoPtr serialization_info;
    };

    /// One component of a substream path.
    struct Substream
    {
        enum Type
        {
            ArrayElements,
            ArraySizes,

            NullableElements,
            NullMap,

            TupleElement,

            DictionaryKeys,
            DictionaryIndexes,

            SparseElements,
            SparseOffsets,

            ObjectStructure,
            ObjectData,

            Regular,
        };

        Type type;

        /// Index of tuple element (starting at 1) or its name.
        String tuple_element_name;

        /// Whether a dot in filenames for tuple elements needs to be escaped.
        bool escape_tuple_delimiter = true;

        /// Data for current substream.
        SubstreamData data;

        /// Creator of subcolumn for current substream.
        SubcolumnCreatorPtr creator = nullptr;

        /// Flag that may help to traverse substream paths.
        /// Declared mutable, so it can be changed through a const Substream.
        mutable bool visited = false;

        /// Intentionally implicit: a Substream can be built directly from a Type.
        Substream(Type type_) : type(type_) {} /// NOLINT

        String toString() const;
    };

    /// Sequence of substream components.
    struct SubstreamPath : public std::vector<Substream>
    {
        String toString() const;
    };

    /// Cache for common substreams of one type that may belong to different subcolumns.
    /// E.g. sizes of arrays of Nested data type.
    using SubstreamsCache = std::unordered_map<String, ColumnPtr>;

    using StreamCallback = std::function<void(const SubstreamPath &)>;

    struct EnumerateStreamsSettings
    {
        SubstreamPath path;
        bool position_independent_encoding = true;
    };

    virtual void enumerateStreams(
        EnumerateStreamsSettings & settings,
        const StreamCallback & callback,
        const SubstreamData & data) const;

    /// Enumerate streams with default settings.
    void enumerateStreams(
        const StreamCallback & callback,
        const DataTypePtr & type = nullptr,
        const ColumnPtr & column = nullptr) const;

    /// Callbacks that map a substream path to the buffer to write to / read from.
    using OutputStreamGetter = std::function<WriteBuffer*(const SubstreamPath &)>;
    using InputStreamGetter = std::function<ReadBuffer*(const SubstreamPath &)>;

    /// Polymorphic base for state carried through a bulk serialization chain.
    struct SerializeBinaryBulkState
    {
        virtual ~SerializeBinaryBulkState() = default;
    };

    /// Polymorphic base for state carried through a bulk deserialization chain.
    struct DeserializeBinaryBulkState
    {
        virtual ~DeserializeBinaryBulkState() = default;
    };

    using SerializeBinaryBulkStatePtr = std::shared_ptr<SerializeBinaryBulkState>;
    using DeserializeBinaryBulkStatePtr = std::shared_ptr<DeserializeBinaryBulkState>;

    struct SerializeBinaryBulkSettings
    {
        OutputStreamGetter getter;
        SubstreamPath path;

        size_t low_cardinality_max_dictionary_size = 0;
        bool low_cardinality_use_single_dictionary_for_part = true;

        bool position_independent_encoding = true;
    };

    struct DeserializeBinaryBulkSettings
    {
        InputStreamGetter getter;
        SubstreamPath path;

        /// True if reading continues from the previous position in the file.
        /// False if an fseek to the start of a new granule was made.
        bool continuous_reading = true;

        bool position_independent_encoding = true;

        bool native_format = false;

        /// If not zero, may be used to avoid reallocations while reading column of String type.
        double avg_value_size_hint = 0;
    };

    /// Call before serializeBinaryBulkWithMultipleStreams chain to write something before first mark.
    /// Column may be used only to retrieve the structure.
    /// The base implementation does nothing.
    virtual void serializeBinaryBulkStatePrefix(
        const IColumn & /*column*/,
        SerializeBinaryBulkSettings & /*settings*/,
        SerializeBinaryBulkStatePtr & /*state*/) const {}

    /// Call after serializeBinaryBulkWithMultipleStreams chain to finish serialization.
    /// The base implementation does nothing.
    virtual void serializeBinaryBulkStateSuffix(
        SerializeBinaryBulkSettings & /*settings*/,
        SerializeBinaryBulkStatePtr & /*state*/) const {}

    /// Call before deserializeBinaryBulkWithMultipleStreams chain to get DeserializeBinaryBulkStatePtr.
    /// The base implementation does nothing.
    virtual void deserializeBinaryBulkStatePrefix(
        DeserializeBinaryBulkSettings & /*settings*/,
        DeserializeBinaryBulkStatePtr & /*state*/) const {}

    /** 'offset' and 'limit' are used to specify range.
      * limit = 0 - means no limit.
      * offset must be not greater than size of column.
      * offset + limit could be greater than size of column
      *  - in that case, column is serialized till the end.
      */
    virtual void serializeBinaryBulkWithMultipleStreams(
        const IColumn & column,
        size_t offset,
        size_t limit,
        SerializeBinaryBulkSettings & settings,
        SerializeBinaryBulkStatePtr & state) const;

    /// Read no more than limit values and append them into column.
    virtual void deserializeBinaryBulkWithMultipleStreams(
        ColumnPtr & column,
        size_t limit,
        DeserializeBinaryBulkSettings & settings,
        DeserializeBinaryBulkStatePtr & state,
        SubstreamsCache * cache) const;

    /** Override these methods for data types that require just single stream (most of data types).
      */
    virtual void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const;
    virtual void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const;

    /** Serialization/deserialization of individual values.
      *
      * These are helper methods for implementation of various formats to input/output for user (like CSV, JSON, etc.).
      * There is no one-to-one correspondence between formats and these methods.
      * For example, TabSeparated and Pretty formats could use same helper method serializeTextEscaped.
      *
      * For complex data types (like arrays) binary serde for individual values may differ from bulk serde.
      * For example, if you serialize single array, it will be represented as its size and elements in single contiguous stream,
      *  but if you bulk serialize column with arrays, then sizes and elements will be written to separate streams.
      */

    /// There are two variants of binary serde. The first variant works with Field.
    virtual void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const = 0;
    virtual void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const = 0;

    /// The other variants take a column, to avoid creating a temporary Field object.
    /// Column must be non-constant.

    /// Serialize one value of a column at specified row number.
    virtual void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0;
    /// Deserialize one value and insert into a column.
    /// If the method throws an exception, the column will be in the same state as before the call.
    virtual void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0;

    /** Text serialization with escaping but without quoting.
      */
    virtual void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0;

    virtual void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0;

    /** Text serialization as a literal that may be inserted into a query.
      */
    virtual void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0;

    virtual void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0;

    /** Text serialization for the CSV format.
      */
    virtual void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0;
    virtual void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0;

    /** Text serialization for displaying on a terminal or saving into a text file, and the like.
      * Without escaping or quoting.
      */
    virtual void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0;

    /** Text deserialization in case when buffer contains only one value, without any escaping and delimiters.
      */
    virtual void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0;

    /** Text serialization intended for using in JSON format.
      * The default serializeTextJSONPretty ignores the indent and forwards to serializeTextJSON.
      */
    virtual void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0;
    virtual void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0;
    virtual void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t /*indent*/) const
    {
        serializeTextJSON(column, row_num, ostr, settings);
    }


    /** Text serialization for putting into the XML format.
      * The default implementation forwards to serializeText.
      */
    virtual void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
    {
        serializeText(column, row_num, ostr, settings);
    }

    /** Text deserialization without escaping and quoting. Reads all data until first \n or \t
     *  into a temporary string and then call deserializeWholeText. It was implemented this way
     *  because this function is rarely used and because proper implementation requires a lot of
     *  additional code in data types serialization and ReadHelpers.
     */
    virtual void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const;
    virtual void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const;

    static String getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path);
    static String getFileNameForStream(const String & name_in_storage, const SubstreamPath & path);
    static String getSubcolumnNameForStream(const SubstreamPath & path);
    static String getSubcolumnNameForStream(const SubstreamPath & path, size_t prefix_len);

    static void addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column);
    static ColumnPtr getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path);

    static bool isSpecialCompressionAllowed(const SubstreamPath & path);

    static size_t getArrayLevel(const SubstreamPath & path);
    static bool hasSubcolumnForPath(const SubstreamPath & path, size_t prefix_len);
    static SubstreamData createFromPath(const SubstreamPath & path, size_t prefix_len);

protected:
    /// Casts `state` to the concrete State type; throws LOGICAL_ERROR if it is empty or of another type.
    template <typename State, typename StatePtr>
    State * checkAndGetState(const StatePtr & state) const;

    [[noreturn]] void throwUnexpectedDataAfterParsedValue(IColumn & column, ReadBuffer & istr, const FormatSettings &, const String & type_name) const;
};

using SerializationPtr = std::shared_ptr<const ISerialization>;
+using Serializations = std::vector<SerializationPtr>; +using SerializationByName = std::unordered_map<String, SerializationPtr>; + +template <typename State, typename StatePtr> +State * ISerialization::checkAndGetState(const StatePtr & state) const +{ + if (!state) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Got empty state for {}", demangle(typeid(*this).name())); + + auto * state_concrete = typeid_cast<State *>(state.get()); + if (!state_concrete) + { + auto & state_ref = *state; + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Invalid State for {}. Expected: {}, got {}", + demangle(typeid(*this).name()), + demangle(typeid(State).name()), + demangle(typeid(state_ref).name())); + } + + return state_concrete; +} + +bool isOffsetsOfNested(const ISerialization::SubstreamPath & path); + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/JSONDataParser.cpp b/contrib/clickhouse/src/DataTypes/Serializations/JSONDataParser.cpp new file mode 100644 index 00000000000..56641424396 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/JSONDataParser.cpp @@ -0,0 +1,281 @@ +#include <DataTypes/Serializations/JSONDataParser.h> +#include <Common/JSONParsers/SimdJSONParser.h> +#include <Common/JSONParsers/RapidJSONParser.h> +#include <Common/checkStackSize.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +template <typename ParserImpl> +std::optional<ParseResult> JSONDataParser<ParserImpl>::parse(const char * begin, size_t length) +{ + std::string_view json{begin, length}; + Element document; + if (!parser.parse(json, document)) + return {}; + + ParseContext context; + traverse(document, context); + + ParseResult result; + result.values = std::move(context.values); + result.paths.reserve(context.paths.size()); + + for (auto && path : context.paths) + result.paths.emplace_back(std::move(path)); + + return result; +} + +template <typename ParserImpl> +void JSONDataParser<ParserImpl>::traverse(const Element & 
element, ParseContext & ctx) +{ + checkStackSize(); + + if (element.isObject()) + { + traverseObject(element.getObject(), ctx); + } + else if (element.isArray()) + { + traverseArray(element.getArray(), ctx); + } + else + { + ctx.paths.push_back(ctx.builder.getParts()); + ctx.values.push_back(getValueAsField(element)); + } +} + +template <typename ParserImpl> +void JSONDataParser<ParserImpl>::traverseObject(const JSONObject & object, ParseContext & ctx) +{ + ctx.paths.reserve(ctx.paths.size() + object.size()); + ctx.values.reserve(ctx.values.size() + object.size()); + + for (auto it = object.begin(); it != object.end(); ++it) + { + const auto & [key, value] = *it; + ctx.builder.append(key, false); + traverse(value, ctx); + ctx.builder.popBack(); + } +} + +template <typename ParserImpl> +void JSONDataParser<ParserImpl>::traverseArray(const JSONArray & array, ParseContext & ctx) +{ + /// Traverse elements of array and collect an array of fields by each path. + ParseArrayContext array_ctx; + array_ctx.total_size = array.size(); + + for (auto it = array.begin(); it != array.end(); ++it) + { + traverseArrayElement(*it, array_ctx); + ++array_ctx.current_size; + } + + auto && arrays_by_path = array_ctx.arrays_by_path; + + if (arrays_by_path.empty()) + { + ctx.paths.push_back(ctx.builder.getParts()); + ctx.values.push_back(Array()); + } + else + { + ctx.paths.reserve(ctx.paths.size() + arrays_by_path.size()); + ctx.values.reserve(ctx.values.size() + arrays_by_path.size()); + + for (auto && [_, value] : arrays_by_path) + { + auto && [path, path_array] = value; + + /// Merge prefix path and path of array element. 
+ ctx.paths.push_back(ctx.builder.append(path, true).getParts()); + ctx.values.push_back(std::move(path_array)); + ctx.builder.popBack(path.size()); + } + } +} + +template <typename ParserImpl> +void JSONDataParser<ParserImpl>::traverseArrayElement(const Element & element, ParseArrayContext & ctx) +{ + ParseContext element_ctx; + traverse(element, element_ctx); + + auto & [_, paths, values] = element_ctx; + size_t size = paths.size(); + size_t keys_to_update = ctx.arrays_by_path.size(); + + for (size_t i = 0; i < size; ++i) + { + if (values[i].isNull()) + continue; + + UInt128 hash = PathInData::getPartsHash(paths[i].begin(), paths[i].end()); + if (auto * found = ctx.arrays_by_path.find(hash)) + { + auto & path_array = found->getMapped().second; + assert(path_array.size() == ctx.current_size); + + /// If current element of array is part of Nested, + /// collect its size or check it if the size of + /// the Nested has been already collected. + auto nested_hash = getHashOfNestedPath(paths[i], values[i]); + if (nested_hash) + { + size_t array_size = values[i].template get<const Array &>().size(); + auto & current_nested_sizes = ctx.nested_sizes_by_path[*nested_hash]; + + if (current_nested_sizes.size() == ctx.current_size) + current_nested_sizes.push_back(array_size); + else if (array_size != current_nested_sizes.back()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Array sizes mismatched ({} and {})", array_size, current_nested_sizes.back()); + } + + path_array.push_back(std::move(values[i])); + --keys_to_update; + } + else + { + /// We found a new key. Add and empty array with current size. 
+ Array path_array; + path_array.reserve(ctx.total_size); + path_array.resize(ctx.current_size); + + auto nested_hash = getHashOfNestedPath(paths[i], values[i]); + if (nested_hash) + { + size_t array_size = values[i].template get<const Array &>().size(); + auto & current_nested_sizes = ctx.nested_sizes_by_path[*nested_hash]; + + if (current_nested_sizes.empty()) + { + current_nested_sizes.resize(ctx.current_size); + } + else + { + /// If newly added element is part of the Nested then + /// resize its elements to keep correct sizes of Nested arrays. + for (size_t j = 0; j < ctx.current_size; ++j) + path_array[j] = Array(current_nested_sizes[j]); + } + + if (current_nested_sizes.size() == ctx.current_size) + current_nested_sizes.push_back(array_size); + else if (array_size != current_nested_sizes.back()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Array sizes mismatched ({} and {})", array_size, current_nested_sizes.back()); + } + + path_array.push_back(std::move(values[i])); + + auto & elem = ctx.arrays_by_path[hash]; + elem.first = std::move(paths[i]); + elem.second = std::move(path_array); + } + } + + /// If some of the keys are missed in current element, + /// add default values for them. 
+ if (keys_to_update) + fillMissedValuesInArrays(ctx); +} + +template <typename ParserImpl> +void JSONDataParser<ParserImpl>::fillMissedValuesInArrays(ParseArrayContext & ctx) +{ + for (auto & [_, value] : ctx.arrays_by_path) + { + auto & [path, path_array] = value; + assert(path_array.size() == ctx.current_size || path_array.size() == ctx.current_size + 1); + + if (path_array.size() == ctx.current_size) + { + bool inserted = tryInsertDefaultFromNested(ctx, path, path_array); + if (!inserted) + path_array.emplace_back(); + } + } +} + +template <typename ParserImpl> +bool JSONDataParser<ParserImpl>::tryInsertDefaultFromNested( + ParseArrayContext & ctx, const PathInData::Parts & path, Array & array) +{ + /// If there is a collected size of current Nested + /// then insert array of this size as a default value. + if (path.empty() || array.empty()) + return false; + + /// Last element is not Null, because otherwise this path wouldn't exist. + auto hash = getHashOfNestedPath(path, array.back()); + if (!hash) + return false; + + auto * mapped = ctx.nested_sizes_by_path.find(*hash); + if (!mapped) + return false; + + auto & current_nested_sizes = mapped->getMapped(); + assert(current_nested_sizes.size() == ctx.current_size || current_nested_sizes.size() == ctx.current_size + 1); + + /// If all keys of Nested were missed then add a zero length. 
+ if (current_nested_sizes.size() == ctx.current_size) + current_nested_sizes.push_back(0); + + size_t array_size = current_nested_sizes.back(); + array.push_back(Array(array_size)); + return true; +} + +template <typename ParserImpl> +Field JSONDataParser<ParserImpl>::getValueAsField(const Element & element) +{ + if (element.isBool()) return element.getBool(); + if (element.isInt64()) return element.getInt64(); + if (element.isUInt64()) return element.getUInt64(); + if (element.isDouble()) return element.getDouble(); + if (element.isString()) return element.getString(); + if (element.isNull()) return Field(); + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unsupported type of JSON field"); +} + +template <typename ParserImpl> +std::optional<UInt128> JSONDataParser<ParserImpl>::getHashOfNestedPath(const PathInData::Parts & path, const Field & value) +{ + if (value.getType() != Field::Types::Array || path.empty()) + return {}; + + /// Find first key that is marked as nested and return hash of its path. + /// It's needed because we may have tuple of Nested and there could be + /// several arrays with the same prefix, but with independent sizes. + /// Consider we have array element with type `k2 Tuple(k3 Nested(...), k5 Nested(...))` + /// Then subcolumns `k2.k3` and `k2.k5` may have indepented sizes and we should extract + /// `k3` and `k5` keys instead of `k2`. 
+ + for (size_t i = 0; i != path.size(); ++i) + if (path[i].is_nested) + return PathInData::getPartsHash(path.begin(), std::next(path.begin(), i + 1)); + + return {}; +} + +#if USE_SIMDJSON + template class JSONDataParser<SimdJSONParser>; +#endif + +#if USE_RAPIDJSON + template class JSONDataParser<RapidJSONParser>; +#endif + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/JSONDataParser.h b/contrib/clickhouse/src/DataTypes/Serializations/JSONDataParser.h new file mode 100644 index 00000000000..b22014df72a --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/JSONDataParser.h @@ -0,0 +1,64 @@ +#pragma once + +#include <IO/ReadHelpers.h> +#include <Common/HashTable/HashMap.h> +#include <DataTypes/Serializations/PathInData.h> + +namespace DB +{ + +class ReadBuffer; + +template <typename ParserImpl> +class JSONDataParser +{ +public: + static void readJSON(String & s, ReadBuffer & buf) + { + readJSONObjectPossiblyInvalid(s, buf); + } + + std::optional<ParseResult> parse(const char * begin, size_t length); + +private: + using Element = typename ParserImpl::Element; + using JSONObject = typename ParserImpl::Object; + using JSONArray = typename ParserImpl::Array; + + struct ParseContext + { + PathInDataBuilder builder; + std::vector<PathInData::Parts> paths; + std::vector<Field> values; + }; + + using PathPartsWithArray = std::pair<PathInData::Parts, Array>; + using PathToArray = HashMapWithStackMemory<UInt128, PathPartsWithArray, UInt128TrivialHash, 5>; + using PathToSizes = HashMapWithStackMemory<UInt128, std::vector<size_t>, UInt128TrivialHash, 5>; + + struct ParseArrayContext + { + size_t current_size = 0; + size_t total_size = 0; + + PathToArray arrays_by_path; + PathToSizes nested_sizes_by_path; + Arena strings_pool; + }; + + void traverse(const Element & element, ParseContext & ctx); + void traverseObject(const JSONObject & object, ParseContext & ctx); + void traverseArray(const JSONArray & array, ParseContext & ctx); + void 
traverseArrayElement(const Element & element, ParseArrayContext & ctx); + + static void fillMissedValuesInArrays(ParseArrayContext & ctx); + static bool tryInsertDefaultFromNested( + ParseArrayContext & ctx, const PathInData::Parts & path, Array & array); + + static Field getValueAsField(const Element & element); + static std::optional<UInt128> getHashOfNestedPath(const PathInData::Parts & path, const Field & value); + + ParserImpl parser; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/PathInData.cpp b/contrib/clickhouse/src/DataTypes/Serializations/PathInData.cpp new file mode 100644 index 00000000000..cf78d7cbb14 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/PathInData.cpp @@ -0,0 +1,156 @@ +#include <DataTypes/Serializations/PathInData.h> +#include <DataTypes/NestedUtils.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/DataTypeArray.h> +#include <Columns/ColumnTuple.h> +#include <Columns/ColumnArray.h> +#include <Common/SipHash.h> + +#include <boost/algorithm/string/split.hpp> +#include <boost/algorithm/string.hpp> + +namespace DB +{ + +PathInData::PathInData(std::string_view path_) + : path(path_) +{ + const char * begin = path.data(); + const char * end = path.data() + path.size(); + + for (const char * it = path.data(); it != end; ++it) + { + if (*it == '.') + { + size_t size = static_cast<size_t>(it - begin); + parts.emplace_back(std::string_view{begin, size}, false, 0); + begin = it + 1; + } + } + + size_t size = static_cast<size_t>(end - begin); + parts.emplace_back(std::string_view{begin, size}, false, 0.); +} + +PathInData::PathInData(const Parts & parts_) +{ + buildPath(parts_); + buildParts(parts_); +} + +PathInData::PathInData(const PathInData & other) + : path(other.path) +{ + buildParts(other.getParts()); +} + +PathInData & PathInData::operator=(const PathInData & other) +{ + if (this != &other) + { + path = other.path; + buildParts(other.parts); + } + return *this; +} + +UInt128 
PathInData::getPartsHash(const Parts::const_iterator & begin, const Parts::const_iterator & end) +{ + SipHash hash; + hash.update(std::distance(begin, end)); + for (auto part_it = begin; part_it != end; ++part_it) + { + hash.update(part_it->key.data(), part_it->key.length()); + hash.update(part_it->is_nested); + hash.update(part_it->anonymous_array_level); + } + + return hash.get128(); +} + +void PathInData::buildPath(const Parts & other_parts) +{ + if (other_parts.empty()) + return; + + path.clear(); + auto it = other_parts.begin(); + path += it->key; + ++it; + for (; it != other_parts.end(); ++it) + { + path += "."; + path += it->key; + } +} + +void PathInData::buildParts(const Parts & other_parts) +{ + if (other_parts.empty()) + return; + + parts.clear(); + parts.reserve(other_parts.size()); + const char * begin = path.data(); + for (const auto & part : other_parts) + { + has_nested |= part.is_nested; + parts.emplace_back(std::string_view{begin, part.key.length()}, part.is_nested, part.anonymous_array_level); + begin += part.key.length() + 1; + } +} + +size_t PathInData::Hash::operator()(const PathInData & value) const +{ + auto hash = getPartsHash(value.parts.begin(), value.parts.end()); + return hash.items[0] ^ hash.items[1]; +} + +PathInDataBuilder & PathInDataBuilder::append(std::string_view key, bool is_array) +{ + if (parts.empty()) + current_anonymous_array_level += is_array; + + if (!key.empty()) + { + if (!parts.empty()) + parts.back().is_nested = is_array; + + parts.emplace_back(key, false, current_anonymous_array_level); + current_anonymous_array_level = 0; + } + + return *this; +} + +PathInDataBuilder & PathInDataBuilder::append(const PathInData::Parts & path, bool is_array) +{ + if (parts.empty()) + current_anonymous_array_level += is_array; + + if (!path.empty()) + { + if (!parts.empty()) + parts.back().is_nested = is_array; + + auto it = parts.insert(parts.end(), path.begin(), path.end()); + for (; it != parts.end(); ++it) + 
it->anonymous_array_level += current_anonymous_array_level; + current_anonymous_array_level = 0; + } + + return *this; +} + +void PathInDataBuilder::popBack() +{ + parts.pop_back(); +} + +void PathInDataBuilder::popBack(size_t n) +{ + assert(n <= parts.size()); + parts.resize(parts.size() - n); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/PathInData.h b/contrib/clickhouse/src/DataTypes/Serializations/PathInData.h new file mode 100644 index 00000000000..5624348bee3 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/PathInData.h @@ -0,0 +1,110 @@ +#pragma once + +#include <Core/Types.h> +#include <Core/Field.h> +#include <bitset> + +namespace DB +{ + +/// Class that represents path in document, e.g. JSON. +class PathInData +{ +public: + struct Part + { + Part() = default; + Part(std::string_view key_, bool is_nested_, UInt8 anonymous_array_level_) + : key(key_), is_nested(is_nested_), anonymous_array_level(anonymous_array_level_) + { + } + + /// Name of part of path. + std::string_view key; + + /// If this part is Nested, i.e. element + /// related to this key is the array of objects. + bool is_nested = false; + + /// Number of array levels between current key and previous key. + /// E.g. in JSON {"k1": [[[{"k2": 1, "k3": 2}]]]} + /// "k1" is nested and has anonymous_array_level = 0. + /// "k2" and "k3" are not nested and have anonymous_array_level = 2. 
+ UInt8 anonymous_array_level = 0; + + bool operator==(const Part & other) const = default; + }; + + using Parts = std::vector<Part>; + + PathInData() = default; + explicit PathInData(std::string_view path_); + explicit PathInData(const Parts & parts_); + + PathInData(const PathInData & other); + PathInData & operator=(const PathInData & other); + + static UInt128 getPartsHash(const Parts::const_iterator & begin, const Parts::const_iterator & end); + + bool empty() const { return parts.empty(); } + + const String & getPath() const { return path; } + const Parts & getParts() const { return parts; } + + bool isNested(size_t i) const { return parts[i].is_nested; } + bool hasNested() const { return has_nested; } + + bool operator==(const PathInData & other) const { return parts == other.parts; } + struct Hash { size_t operator()(const PathInData & value) const; }; + +private: + /// Creates full path from parts. + void buildPath(const Parts & other_parts); + + /// Creates new parts full from full path with correct string pointers. + void buildParts(const Parts & other_parts); + + /// The full path. Parts are separated by dots. + String path; + + /// Parts of the path. All string_view-s in parts must point to the @path. + Parts parts; + + /// True if at least one part is nested. + /// Cached to avoid linear complexity at 'hasNested'. + bool has_nested = false; +}; + +class PathInDataBuilder +{ +public: + const PathInData::Parts & getParts() const { return parts; } + + PathInDataBuilder & append(std::string_view key, bool is_array); + PathInDataBuilder & append(const PathInData::Parts & path, bool is_array); + + void popBack(); + void popBack(size_t n); + +private: + PathInData::Parts parts; + + /// Number of array levels without key to which + /// next non-empty key will be nested. + /// Example: for JSON { "k1": [[{"k2": 1, "k3": 2}] } + // `k2` and `k3` has anonymous_array_level = 1 in that case. 
+ size_t current_anonymous_array_level = 0; +}; + +using PathsInData = std::vector<PathInData>; + +/// Result of parsing of a document. +/// Contains all paths extracted from document +/// and values which are related to them. +struct ParseResult +{ + std::vector<PathInData> paths; + std::vector<Field> values; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationAggregateFunction.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationAggregateFunction.cpp new file mode 100644 index 00000000000..c482c9623e9 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationAggregateFunction.cpp @@ -0,0 +1,218 @@ +#include <DataTypes/Serializations/SerializationAggregateFunction.h> + +#include <IO/WriteHelpers.h> + +#include <Columns/ColumnAggregateFunction.h> + +#include <Common/typeid_cast.h> +#include <Common/assert_cast.h> +#include <Common/AlignedBuffer.h> +#include <Common/Arena.h> + +#include <Formats/FormatSettings.h> +#include <Formats/ProtobufReader.h> +#include <Formats/ProtobufWriter.h> +#include <IO/WriteBufferFromString.h> +#include <IO/Operators.h> + +namespace DB +{ + +void SerializationAggregateFunction::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const +{ + const AggregateFunctionStateData & state = field.get<const AggregateFunctionStateData &>(); + writeBinary(state.data, ostr); +} + +void SerializationAggregateFunction::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const +{ + field = AggregateFunctionStateData(); + AggregateFunctionStateData & s = field.get<AggregateFunctionStateData &>(); + readBinary(s.data, istr); + s.name = type_name; +} + +void SerializationAggregateFunction::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + function->serialize(assert_cast<const ColumnAggregateFunction &>(column).getData()[row_num], ostr, version); +} + +void 
SerializationAggregateFunction::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ColumnAggregateFunction & column_concrete = assert_cast<ColumnAggregateFunction &>(column); + + Arena & arena = column_concrete.createOrGetArena(); + size_t size_of_state = function->sizeOfData(); + AggregateDataPtr place = arena.alignedAlloc(size_of_state, function->alignOfData()); + + function->create(place); + try + { + function->deserialize(place, istr, version, &arena); + } + catch (...) + { + function->destroy(place); + throw; + } + + column_concrete.getData().push_back(place); +} + +void SerializationAggregateFunction::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const ColumnAggregateFunction & real_column = typeid_cast<const ColumnAggregateFunction &>(column); + const ColumnAggregateFunction::Container & vec = real_column.getData(); + + ColumnAggregateFunction::Container::const_iterator it = vec.begin() + offset; + ColumnAggregateFunction::Container::const_iterator end = limit ? 
it + limit : vec.end(); + + if (end > vec.end()) + end = vec.end(); + + for (; it != end; ++it) + function->serialize(*it, ostr, version); +} + +void SerializationAggregateFunction::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const +{ + ColumnAggregateFunction & real_column = typeid_cast<ColumnAggregateFunction &>(column); + ColumnAggregateFunction::Container & vec = real_column.getData(); + + Arena & arena = real_column.createOrGetArena(); + real_column.set(function, version); + vec.reserve(vec.size() + limit); + + size_t size_of_state = function->sizeOfData(); + size_t align_of_state = function->alignOfData(); + + for (size_t i = 0; i < limit; ++i) + { + if (istr.eof()) + break; + + AggregateDataPtr place = arena.alignedAlloc(size_of_state, align_of_state); + + function->create(place); + + try + { + function->deserialize(place, istr, version, &arena); + } + catch (...) + { + function->destroy(place); + throw; + } + + vec.push_back(place); + } +} + +static String serializeToString(const AggregateFunctionPtr & function, const IColumn & column, size_t row_num, size_t version) +{ + WriteBufferFromOwnString buffer; + function->serialize(assert_cast<const ColumnAggregateFunction &>(column).getData()[row_num], buffer, version); + return buffer.str(); +} + +static void deserializeFromString(const AggregateFunctionPtr & function, IColumn & column, const String & s, size_t version) +{ + ColumnAggregateFunction & column_concrete = assert_cast<ColumnAggregateFunction &>(column); + + Arena & arena = column_concrete.createOrGetArena(); + size_t size_of_state = function->sizeOfData(); + AggregateDataPtr place = arena.alignedAlloc(size_of_state, function->alignOfData()); + + function->create(place); + + try + { + ReadBufferFromString istr(s); + function->deserialize(place, istr, version, &arena); + } + catch (...) 
+ { + function->destroy(place); + throw; + } + + column_concrete.getData().push_back(place); +} + +void SerializationAggregateFunction::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeString(serializeToString(function, column, row_num, version), ostr); +} + + +void SerializationAggregateFunction::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeEscapedString(serializeToString(function, column, row_num, version), ostr); +} + + +void SerializationAggregateFunction::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + String s; + readEscapedString(s, istr); + deserializeFromString(function, column, s, version); +} + + +void SerializationAggregateFunction::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeQuotedString(serializeToString(function, column, row_num, version), ostr); +} + + +void SerializationAggregateFunction::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + String s; + readQuotedStringWithSQLStyle(s, istr); + deserializeFromString(function, column, s, version); +} + + +void SerializationAggregateFunction::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + String s; + readStringUntilEOF(s, istr); + deserializeFromString(function, column, s, version); +} + + +void SerializationAggregateFunction::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeJSONString(serializeToString(function, column, row_num, version), ostr, settings); +} + + +void SerializationAggregateFunction::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + String s; + readJSONString(s, istr); + deserializeFromString(function, column, s, version); 
+} + + +void SerializationAggregateFunction::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeXMLStringForTextElement(serializeToString(function, column, row_num, version), ostr); +} + + +void SerializationAggregateFunction::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeCSV(serializeToString(function, column, row_num, version), ostr); +} + + +void SerializationAggregateFunction::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String s; + readCSV(s, istr, settings.csv); + deserializeFromString(function, column, s, version); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationAggregateFunction.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationAggregateFunction.h new file mode 100644 index 00000000000..4212298bbc1 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationAggregateFunction.h @@ -0,0 +1,46 @@ +#pragma once + +#include <AggregateFunctions/IAggregateFunction.h> + +#include <DataTypes/Serializations/ISerialization.h> + + +namespace DB +{ + +class SerializationAggregateFunction final : public ISerialization +{ +private: + AggregateFunctionPtr function; + String type_name; + size_t version; + +public: + static constexpr bool is_parametric = true; + + SerializationAggregateFunction(const AggregateFunctionPtr & function_, String type_name_, size_t version_) + : function(function_), type_name(std::move(type_name_)), version(version_) {} + + /// NOTE These two functions for serializing single values are incompatible with the functions below. 
+ void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; +}; + +} diff --git 
a/contrib/clickhouse/src/DataTypes/Serializations/SerializationArray.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationArray.cpp new file mode 100644 index 00000000000..e01c1aea0e9 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationArray.cpp @@ -0,0 +1,620 @@ +#include <DataTypes/Serializations/SerializationArray.h> +#include <DataTypes/Serializations/SerializationNullable.h> +#include <DataTypes/Serializations/SerializationNumber.h> +#include <DataTypes/Serializations/SerializationNamed.h> +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypesNumber.h> +#include <Columns/ColumnArray.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> +#include <IO/ReadBufferFromString.h> +#include <IO/WriteBufferFromString.h> + +#include <Formats/FormatSettings.h> +#include <Formats/ProtobufReader.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_READ_ALL_DATA; + extern const int CANNOT_READ_ARRAY_FROM_TEXT; + extern const int LOGICAL_ERROR; + extern const int TOO_LARGE_ARRAY_SIZE; +} + +static constexpr size_t MAX_ARRAY_SIZE = 1ULL << 30; +static constexpr size_t MAX_ARRAYS_SIZE = 1ULL << 40; + + +void SerializationArray::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const Array & a = field.get<const Array &>(); + writeVarUInt(a.size(), ostr); + for (const auto & i : a) + { + nested->serializeBinary(i, ostr, settings); + } +} + + +void SerializationArray::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const +{ + size_t size; + readVarUInt(size, istr); + if (settings.max_binary_array_size && size > settings.max_binary_array_size) + throw Exception( + ErrorCodes::TOO_LARGE_ARRAY_SIZE, + "Too large array size: {}. The maximum is: {}. 
To increase the maximum, use setting " + "format_binary_max_array_size", + size, + settings.max_binary_array_size); + + field = Array(); + Array & arr = field.get<Array &>(); + arr.reserve(size); + for (size_t i = 0; i < size; ++i) + nested->deserializeBinary(arr.emplace_back(), istr, settings); +} + + +void SerializationArray::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnArray & column_array = assert_cast<const ColumnArray &>(column); + const ColumnArray::Offsets & offsets = column_array.getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + size_t size = next_offset - offset; + + writeVarUInt(size, ostr); + + const IColumn & nested_column = column_array.getData(); + for (size_t i = offset; i < next_offset; ++i) + nested->serializeBinary(nested_column, i, ostr, settings); +} + + +void SerializationArray::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnArray & column_array = assert_cast<ColumnArray &>(column); + ColumnArray::Offsets & offsets = column_array.getOffsets(); + + size_t size; + readVarUInt(size, istr); + if (settings.max_binary_array_size && size > settings.max_binary_array_size) + throw Exception( + ErrorCodes::TOO_LARGE_ARRAY_SIZE, + "Too large array size: {}. The maximum is: {}. To increase the maximum, use setting " + "format_binary_max_array_size", + size, + settings.max_binary_array_size); + + IColumn & nested_column = column_array.getData(); + + size_t i = 0; + try + { + for (; i < size; ++i) + nested->deserializeBinary(nested_column, istr, settings); + } + catch (...) 
+ { + if (i) + nested_column.popBack(i); + throw; + } + + offsets.push_back(offsets.back() + size); +} + + +namespace +{ + void serializeArraySizesPositionIndependent(const IColumn & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) + { + const ColumnArray & column_array = typeid_cast<const ColumnArray &>(column); + const ColumnArray::Offsets & offset_values = column_array.getOffsets(); + size_t size = offset_values.size(); + + if (!size) + return; + + size_t end = limit && (offset + limit < size) + ? offset + limit + : size; + + ColumnArray::Offset prev_offset = offset_values[offset - 1]; + for (size_t i = offset; i < end; ++i) + { + ColumnArray::Offset current_offset = offset_values[i]; + writeBinaryLittleEndian(current_offset - prev_offset, ostr); + prev_offset = current_offset; + } + } + + void deserializeArraySizesPositionIndependent(ColumnArray & column_array, ReadBuffer & istr, UInt64 limit) + { + ColumnArray::Offsets & offset_values = column_array.getOffsets(); + size_t initial_size = offset_values.size(); + offset_values.resize(initial_size + limit); + + size_t i = initial_size; + ColumnArray::Offset current_offset = initial_size ? 
offset_values[initial_size - 1] : 0; + while (i < initial_size + limit && !istr.eof()) + { + ColumnArray::Offset current_size = 0; + readBinaryLittleEndian(current_size, istr); + + if (unlikely(current_size > MAX_ARRAY_SIZE)) + throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Array size is too large: {}", current_size); + if (unlikely(__builtin_add_overflow(current_offset, current_size, &current_offset))) + throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Deserialization of array offsets will lead to overflow"); + + offset_values[i] = current_offset; + ++i; + } + + offset_values.resize(i); + } + + ColumnPtr arraySizesToOffsets(const IColumn & column) + { + const auto & column_sizes = assert_cast<const ColumnArray::ColumnOffsets &>(column); + MutableColumnPtr column_offsets = column_sizes.cloneEmpty(); + + if (column_sizes.empty()) + return column_offsets; + + const auto & sizes_data = column_sizes.getData(); + auto & offsets_data = assert_cast<ColumnArray::ColumnOffsets &>(*column_offsets).getData(); + + offsets_data.resize(sizes_data.size()); + + IColumn::Offset prev_offset = 0; + for (size_t i = 0, size = sizes_data.size(); i < size; ++i) + { + prev_offset += sizes_data[i]; + offsets_data[i] = prev_offset; + } + + return column_offsets; + } + + ColumnPtr arrayOffsetsToSizes(const IColumn & column) + { + const auto & column_offsets = assert_cast<const ColumnArray::ColumnOffsets &>(column); + MutableColumnPtr column_sizes = column_offsets.cloneEmpty(); + + if (column_offsets.empty()) + return column_sizes; + + const auto & offsets_data = column_offsets.getData(); + auto & sizes_data = assert_cast<ColumnArray::ColumnOffsets &>(*column_sizes).getData(); + + sizes_data.resize(offsets_data.size()); + + IColumn::Offset prev_offset = 0; + for (size_t i = 0, size = offsets_data.size(); i < size; ++i) + { + auto current_offset = offsets_data[i]; + sizes_data[i] = current_offset - prev_offset; + prev_offset = current_offset; + } + + return column_sizes; + } +} + +DataTypePtr
SerializationArray::SubcolumnCreator::create(const DataTypePtr & prev) const +{ + return std::make_shared<DataTypeArray>(prev); +} + +SerializationPtr SerializationArray::SubcolumnCreator::create(const SerializationPtr & prev) const +{ + return std::make_shared<SerializationArray>(prev); +} + +ColumnPtr SerializationArray::SubcolumnCreator::create(const ColumnPtr & prev) const +{ + return ColumnArray::create(prev, offsets); +} + +void SerializationArray::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + const auto * type_array = data.type ? &assert_cast<const DataTypeArray &>(*data.type) : nullptr; + const auto * column_array = data.column ? &assert_cast<const ColumnArray &>(*data.column) : nullptr; + auto offsets = column_array ? column_array->getOffsetsPtr() : nullptr; + + auto offsets_serialization = + std::make_shared<SerializationNamed>( + std::make_shared<SerializationNumber<UInt64>>(), + "size" + std::to_string(getArrayLevel(settings.path)), false); + + auto offsets_column = offsets && !settings.position_independent_encoding + ? arrayOffsetsToSizes(*offsets) + : offsets; + + settings.path.push_back(Substream::ArraySizes); + settings.path.back().data = SubstreamData(offsets_serialization) + .withType(type_array ? std::make_shared<DataTypeUInt64>() : nullptr) + .withColumn(std::move(offsets_column)) + .withSerializationInfo(data.serialization_info); + + callback(settings.path); + + settings.path.back() = Substream::ArrayElements; + settings.path.back().data = data; + settings.path.back().creator = std::make_shared<SubcolumnCreator>(offsets); + + auto next_data = SubstreamData(nested) + .withType(type_array ? type_array->getNestedType() : nullptr) + .withColumn(column_array ? 
column_array->getDataPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + nested->enumerateStreams(settings, callback, next_data); + settings.path.pop_back(); +} + +void SerializationArray::serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::ArrayElements); + const auto & column_array = assert_cast<const ColumnArray &>(column); + nested->serializeBinaryBulkStatePrefix(column_array.getData(), settings, state); + settings.path.pop_back(); +} + + +void SerializationArray::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::ArrayElements); + nested->serializeBinaryBulkStateSuffix(settings, state); + settings.path.pop_back(); +} + + +void SerializationArray::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::ArrayElements); + nested->deserializeBinaryBulkStatePrefix(settings, state); + settings.path.pop_back(); +} + + +void SerializationArray::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + const ColumnArray & column_array = typeid_cast<const ColumnArray &>(column); + + /// First serialize array sizes. + settings.path.push_back(Substream::ArraySizes); + if (auto * stream = settings.getter(settings.path)) + { + if (settings.position_independent_encoding) + serializeArraySizesPositionIndependent(column, *stream, offset, limit); + else + SerializationNumber<ColumnArray::Offset>().serializeBinaryBulk(*column_array.getOffsetsPtr(), *stream, offset, limit); + } + + /// Then serialize contents of arrays. 
+ settings.path.back() = Substream::ArrayElements; + const ColumnArray::Offsets & offset_values = column_array.getOffsets(); + + if (offset > offset_values.size()) + return; + + /** offset - from which array to write. + * limit - how many arrays should be written, or 0, if you write everything that is. + * end - up to which array the recorded piece ends. + * + * nested_offset - from which element of the innards to write. + * nested_limit - how many elements of the innards to write, or 0, if you write everything that is. + */ + + size_t end = std::min(offset + limit, offset_values.size()); + + size_t nested_offset = offset ? offset_values[offset - 1] : 0; + size_t nested_limit = limit + ? offset_values[end - 1] - nested_offset + : 0; + + if (limit == 0 || nested_limit) + nested->serializeBinaryBulkWithMultipleStreams(column_array.getData(), nested_offset, nested_limit, settings, state); + settings.path.pop_back(); +} + + +void SerializationArray::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto mutable_column = column->assumeMutable(); + ColumnArray & column_array = typeid_cast<ColumnArray &>(*mutable_column); + settings.path.push_back(Substream::ArraySizes); + + if (auto cached_column = getFromSubstreamsCache(cache, settings.path)) + { + column_array.getOffsetsPtr() = arraySizesToOffsets(*cached_column); + } + else if (auto * stream = settings.getter(settings.path)) + { + if (settings.position_independent_encoding) + deserializeArraySizesPositionIndependent(column_array, *stream, limit); + else + SerializationNumber<ColumnArray::Offset>().deserializeBinaryBulk(column_array.getOffsetsColumn(), *stream, limit, 0); + + addToSubstreamsCache(cache, settings.path, arrayOffsetsToSizes(column_array.getOffsetsColumn())); + } + + settings.path.back() = Substream::ArrayElements; + + ColumnArray::Offsets & offset_values 
= column_array.getOffsets(); + ColumnPtr & nested_column = column_array.getDataPtr(); + + /// Number of values corresponding with `offset_values` must be read. + size_t last_offset = offset_values.back(); + if (last_offset < nested_column->size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Nested column is longer than last offset"); + size_t nested_limit = last_offset - nested_column->size(); + + if (unlikely(nested_limit > MAX_ARRAYS_SIZE)) + throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Array sizes are too large: {}", nested_limit); + + /// Adjust value size hint. Divide it by the average array size. + settings.avg_value_size_hint = nested_limit ? settings.avg_value_size_hint / nested_limit * offset_values.size() : 0; + + nested->deserializeBinaryBulkWithMultipleStreams(nested_column, nested_limit, settings, state, cache); + + settings.path.pop_back(); + + /// Check consistency between offsets and elements subcolumns. + /// But if elements column is empty - it's ok for columns of Nested types that were added by ALTER. 
+ if (!nested_column->empty() && nested_column->size() != last_offset) + throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Cannot read all array values: read just {} of {}", + toString(nested_column->size()), toString(last_offset)); + + column = std::move(mutable_column); +} + + +template <typename Writer> +static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && write_nested) +{ + const ColumnArray & column_array = assert_cast<const ColumnArray &>(column); + const ColumnArray::Offsets & offsets = column_array.getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + + const IColumn & nested_column = column_array.getData(); + + writeChar('[', ostr); + for (size_t i = offset; i < next_offset; ++i) + { + if (i != offset) + writeChar(',', ostr); + write_nested(nested_column, i); + } + writeChar(']', ostr); +} + + +template <typename Reader> +static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && read_nested, bool allow_unenclosed) +{ + ColumnArray & column_array = assert_cast<ColumnArray &>(column); + ColumnArray::Offsets & offsets = column_array.getOffsets(); + + IColumn & nested_column = column_array.getData(); + + size_t size = 0; + + bool has_braces = false; + if (checkChar('[', istr)) + has_braces = true; + else if (!allow_unenclosed) + throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Array does not start with '[' character"); + + try + { + bool first = true; + while (!istr.eof() && *istr.position() != ']') + { + if (!first) + { + if (*istr.position() == ',') + ++istr.position(); + else + throw ParsingException(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, + "Cannot read array from text, expected comma or end of array, found '{}'", + *istr.position()); + } + + first = false; + + skipWhitespaceIfAny(istr); + + if (*istr.position() == ']') + break; + + read_nested(nested_column); + ++size; + + skipWhitespaceIfAny(istr); + } + + if (has_braces) + 
assertChar(']', istr); + else /// If array is not enclosed in braces, we read until EOF. + assertEOF(istr); + } + catch (...) + { + if (size) + nested_column.popBack(size); + throw; + } + + offsets.push_back(offsets.back() + size); +} + + +void SerializationArray::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeTextImpl(column, row_num, ostr, + [&](const IColumn & nested_column, size_t i) + { + nested->serializeTextQuoted(nested_column, i, ostr, settings); + }); +} + + +void SerializationArray::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const +{ + deserializeTextImpl(column, istr, + [&](IColumn & nested_column) + { + nested->deserializeTextQuoted(nested_column, istr, settings); + }, false); + + if (whole && !istr.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "Array"); +} + +void SerializationArray::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnArray & column_array = assert_cast<const ColumnArray &>(column); + const ColumnArray::Offsets & offsets = column_array.getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + + const IColumn & nested_column = column_array.getData(); + + writeChar('[', ostr); + for (size_t i = offset; i < next_offset; ++i) + { + if (i != offset) + writeChar(',', ostr); + nested->serializeTextJSON(nested_column, i, ostr, settings); + } + writeChar(']', ostr); +} + +void SerializationArray::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const +{ + const ColumnArray & column_array = assert_cast<const ColumnArray &>(column); + const ColumnArray::Offsets & offsets = column_array.getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + + 
const IColumn & nested_column = column_array.getData(); + + if (offset == next_offset) + { + writeCString("[]", ostr); + return; + } + + writeCString("[\n", ostr); + for (size_t i = offset; i < next_offset; ++i) + { + if (i != offset) + writeCString(",\n", ostr); + writeChar(' ', (indent + 1) * 4, ostr); + nested->serializeTextJSONPretty(nested_column, i, ostr, settings, indent + 1); + } + writeChar('\n', ostr); + writeChar(' ', indent * 4, ostr); + writeChar(']', ostr); +} + + +void SerializationArray::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextImpl(column, istr, + [&](IColumn & nested_column) + { + if (settings.null_as_default) + SerializationNullable::deserializeTextJSONImpl(nested_column, istr, settings, nested); + else + nested->deserializeTextJSON(nested_column, istr, settings); + }, false); +} + + +void SerializationArray::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnArray & column_array = assert_cast<const ColumnArray &>(column); + const ColumnArray::Offsets & offsets = column_array.getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + + const IColumn & nested_column = column_array.getData(); + + writeCString("<array>", ostr); + for (size_t i = offset; i < next_offset; ++i) + { + writeCString("<elem>", ostr); + nested->serializeTextXML(nested_column, i, ostr, settings); + writeCString("</elem>", ostr); + } + writeCString("</array>", ostr); +} + + +void SerializationArray::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + /// There is no good way to serialize an array in CSV. Therefore, we serialize it into a string, and then write the resulting string in CSV. 
+ WriteBufferFromOwnString wb; + serializeText(column, row_num, wb, settings); + writeCSV(wb.str(), ostr); +} + + +void SerializationArray::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String s; + readCSV(s, istr, settings.csv); + ReadBufferFromString rb(s); + + if (settings.csv.arrays_as_nested_csv) + { + deserializeTextImpl(column, rb, + [&](IColumn & nested_column) + { + nested->deserializeTextCSV(nested_column, rb, settings); + }, true); + } + else + { + deserializeTextImpl(column, rb, + [&](IColumn & nested_column) + { + nested->deserializeTextQuoted(nested_column, rb, settings); + }, true); + } +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationArray.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationArray.h new file mode 100644 index 00000000000..de331169db5 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationArray.h @@ -0,0 +1,84 @@ +#pragma once + +#include <DataTypes/Serializations/SimpleTextSerialization.h> + +namespace DB +{ + +class SerializationArray final : public SimpleTextSerialization +{ +private: + SerializationPtr nested; + +public: + explicit SerializationArray(const SerializationPtr & nested_) : nested(nested_) {} + + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; + + void 
serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + /** Streaming serialization of arrays is arranged in a special way: + * - elements placed in a row are written/read without array sizes; + * - the sizes are written/read in a separate stream, + * This is necessary, because when implementing nested structures, several arrays can have common sizes. 
+ */ + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + +private: + struct SubcolumnCreator : public ISubcolumnCreator + { + const ColumnPtr offsets; + + explicit SubcolumnCreator(const ColumnPtr & offsets_) : offsets(offsets_) {} + + DataTypePtr create(const DataTypePtr & prev) const override; + SerializationPtr create(const SerializationPtr & prev) const override; + ColumnPtr create(const ColumnPtr & prev) const override; + }; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationBool.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationBool.cpp new file mode 100644 index 00000000000..41b5bf806e5 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationBool.cpp @@ -0,0 +1,335 @@ +#include <DataTypes/Serializations/SerializationBool.h> + +#include <Columns/ColumnsNumber.h> +#include <Common/Exception.h> +#include <IO/WriteBuffer.h> +#include <IO/ReadBuffer.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> +#include 
<IO/PeekableReadBuffer.h> + +#include <unordered_set> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int CANNOT_PARSE_BOOL; +} + +namespace +{ + +constexpr char str_true[5] = "true"; +constexpr char str_false[6] = "false"; + +const ColumnUInt8 * checkAndGetSerializeColumnType(const IColumn & column) +{ + const auto * col = checkAndGetColumn<ColumnUInt8>(&column); + if (!checkAndGetColumn<ColumnUInt8>(&column)) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Bool type can only serialize columns of type UInt8.{}", column.getName()); + return col; +} + +ColumnUInt8 * checkAndGetDeserializeColumnType(IColumn & column) +{ + auto * col = typeid_cast<ColumnUInt8 *>(&column); + if (!checkAndGetColumn<ColumnUInt8>(&column)) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Bool type can only deserialize columns of type UInt8.{}", + column.getName()); + return col; +} + +void serializeCustom(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) +{ + const auto * col = checkAndGetSerializeColumnType(column); + + if (col->getData()[row_num]) + { + writeString(settings.bool_true_representation, ostr); + } + else + { + writeString(settings.bool_false_representation, ostr); + } +} + +void serializeSimple(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) +{ + const auto * col = checkAndGetSerializeColumnType(column); + + if (col->getData()[row_num]) + ostr.write(str_true, sizeof(str_true) - 1); + else + ostr.write(str_false, sizeof(str_false) - 1); +} + +bool tryDeserializeAllVariants(ColumnUInt8 * column, ReadBuffer & istr) +{ + if (checkCharCaseInsensitive('1', istr)) + { + column->insert(true); + } + else if (checkCharCaseInsensitive('0', istr)) + { + column->insert(false); + } + /// 'True' and 'T' + else if (checkCharCaseInsensitive('t', istr)) + { + /// Check if it's just short form `T` or full form `True` + if (checkCharCaseInsensitive('r', istr)) + { + if 
(!checkStringCaseInsensitive("ue", istr)) + return false; + } + column->insert(true); + } + /// 'False' and 'F' + else if (checkCharCaseInsensitive('f', istr)) + { + /// Check if it's just short form `F` or full form `False` + if (checkCharCaseInsensitive('a', istr)) + { + if (!checkStringCaseInsensitive("lse", istr)) + return false; + } + column->insert(false); + } + /// 'Yes' and 'Y' + else if (checkCharCaseInsensitive('y', istr)) + { + /// Check if it's just short form `Y` or full form `Yes` + if (checkCharCaseInsensitive('e', istr)) + { + if (!checkCharCaseInsensitive('s', istr)) + return false; + } + column->insert(true); + } + /// 'No' and 'N' + else if (checkCharCaseInsensitive('n', istr)) + { + /// Check if it's just short form `N` or full form `No` + checkCharCaseInsensitive('o', istr); + column->insert(false); + } + /// 'On' and 'Off' + else if (checkCharCaseInsensitive('o', istr)) + { + if (checkCharCaseInsensitive('n', istr)) + column->insert(true); + else if (checkStringCaseInsensitive("ff", istr)) + { + column->insert(false); + } + else + return false; + } + /// 'Enable' and 'Enabled' + else if (checkStringCaseInsensitive("enable", istr)) + { + /// Check if it's 'enable' or 'enabled' + checkCharCaseInsensitive('d', istr); + column->insert(true); + } + /// 'Disable' and 'Disabled' + else if (checkStringCaseInsensitive("disable", istr)) + { + /// Check if it's 'disable' or 'disabled' + checkCharCaseInsensitive('d', istr); + column->insert(false); + } + else + { + return false; + } + + return true; +} + +void deserializeImpl( + IColumn & column, ReadBuffer & istr, const FormatSettings & settings, std::function<bool(ReadBuffer &)> check_end_of_value) +{ + ColumnUInt8 * col = checkAndGetDeserializeColumnType(column); + + PeekableReadBuffer buf(istr); + buf.setCheckpoint(); + if (checkString(settings.bool_true_representation, buf) && check_end_of_value(buf)) + { + col->insert(true); + return; + } + + buf.rollbackToCheckpoint(); + if 
(checkString(settings.bool_false_representation, buf) && check_end_of_value(buf)) + { + col->insert(false); + buf.dropCheckpoint(); + if (buf.hasUnreadData()) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if " + "bool_true_representation or bool_false_representation contains some delimiters of input format"); + return; + } + + buf.rollbackToCheckpoint(); + if (tryDeserializeAllVariants(col, buf) && check_end_of_value(buf)) + { + buf.dropCheckpoint(); + if (buf.hasUnreadData()) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if " + "bool_true_representation or bool_false_representation contains some delimiters of input format"); + return; + } + + buf.makeContinuousMemoryFromCheckpointToPos(); + buf.rollbackToCheckpoint(); + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot parse boolean value here: '{}', should be '{}' or '{}' controlled by setting bool_true_representation and " + "bool_false_representation or one of " + "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0", + String(buf.position(), std::min(10lu, buf.available())), + settings.bool_true_representation, settings.bool_false_representation); +} + +} + + +SerializationBool::SerializationBool(const SerializationPtr &nested_) + : SerializationWrapper(nested_) +{ +} + +void SerializationBool::serializeText(const IColumn & column, size_t row_num, WriteBuffer &ostr, const FormatSettings & settings) const +{ + serializeCustom(column, row_num, ostr, settings); +} + +void SerializationBool::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeCustom(column, row_num, ostr, settings); +} + +void SerializationBool::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, 
const FormatSettings & settings) const +{ + if (istr.eof()) + throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF."); + + deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); +} + +void SerializationBool::serializeTextJSON(const IColumn &column, size_t row_num, WriteBuffer &ostr, const FormatSettings &settings) const +{ + serializeSimple(column, row_num, ostr, settings); +} + +void SerializationBool::deserializeTextJSON(IColumn &column, ReadBuffer &istr, const FormatSettings &) const +{ + if (istr.eof()) + throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF."); + + ColumnUInt8 * col = checkAndGetDeserializeColumnType(column); + bool value = false; + + char first_char = *istr.position(); + if (first_char == 't' || first_char == 'f') + readBoolTextWord(value, istr); + else if (first_char == '1' || first_char == '0') + readBoolText(value, istr); + else + throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, + "Invalid boolean value, should be true/false, 1/0, but it starts with the '{}' character.", first_char); + + col->insert(value); +} + +void SerializationBool::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeCustom(column, row_num, ostr, settings); +} + +void SerializationBool::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF."); + + deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == settings.csv.delimiter || *buf.position() == '\n' || *buf.position() == '\r'; }); +} + +void SerializationBool::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeCustom(column, row_num, 
ostr, settings); +} + +void SerializationBool::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF."); + + deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); +} + +void SerializationBool::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeSimple(column, row_num, ostr, settings); +} + +void SerializationBool::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF."); + + auto * col = checkAndGetDeserializeColumnType(column); + + char symbol = toLowerIfAlphaASCII(*istr.position()); + switch (symbol) + { + case 't': + assertStringCaseInsensitive("true", istr); + col->insert(true); + break; + case 'f': + assertStringCaseInsensitive("false", istr); + col->insert(false); + break; + case '1': + col->insert(true); + break; + case '0': + col->insert(false); + break; + case '\'': + ++istr.position(); + deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return !buf.eof() && *buf.position() == '\''; }); + assertChar('\'', istr); + break; + default: + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot parse boolean value here: '{}', should be true/false, 1/0 or on of " + "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0 in quotes", + String(istr.position(), std::min(10ul, istr.available()))); + } +} + +void SerializationBool::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF."); + + deserializeImpl(column, istr, settings, 
[&](ReadBuffer & buf){ return buf.eof(); }); +} + +void SerializationBool::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeSimple(column, row_num, ostr, settings); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationBool.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationBool.h new file mode 100644 index 00000000000..a5aa0ca80a2 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationBool.h @@ -0,0 +1,37 @@ +#pragma once + +#include <DataTypes/Serializations/SerializationWrapper.h> +#include <Columns/ColumnsNumber.h> +#include <unordered_set> + +namespace DB +{ + +class SerializationBool final : public SerializationWrapper +{ +public: + explicit SerializationBool(const SerializationPtr & nested_); + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextQuoted(const 
IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp new file mode 100644 index 00000000000..c35e1120ce8 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp @@ -0,0 +1,97 @@ +#include <DataTypes/Serializations/SerializationCustomSimpleText.h> + +#include <IO/ReadBufferFromString.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteBufferFromString.h> +#include <IO/WriteHelpers.h> + +namespace +{ + +using namespace DB; + +String serializeToString(const SerializationCustomSimpleText & domain, const IColumn & column, size_t row_num, const FormatSettings & settings) +{ + WriteBufferFromOwnString buffer; + domain.serializeText(column, row_num, buffer, settings); + + return buffer.str(); +} + +void deserializeFromString(const SerializationCustomSimpleText & domain, IColumn & column, const String & s, const FormatSettings & settings) +{ + ReadBufferFromString istr(s); + domain.deserializeText(column, istr, settings, true); +} + +} + +namespace DB +{ + +SerializationCustomSimpleText::SerializationCustomSimpleText(const SerializationPtr & nested_) + : SerializationWrapper(nested_) +{ +} + +void SerializationCustomSimpleText::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readStringUntilEOF(str, istr); + deserializeFromString(*this, column, str, settings); +} + +void 
SerializationCustomSimpleText::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeEscapedString(serializeToString(*this, column, row_num, settings), ostr); +} + +void SerializationCustomSimpleText::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readEscapedString(str, istr); + deserializeFromString(*this, column, str, settings); +} + +void SerializationCustomSimpleText::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeQuotedString(serializeToString(*this, column, row_num, settings), ostr); +} + +void SerializationCustomSimpleText::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readQuotedString(str, istr); + deserializeFromString(*this, column, str, settings); +} + +void SerializationCustomSimpleText::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeCSVString(serializeToString(*this, column, row_num, settings), ostr); +} + +void SerializationCustomSimpleText::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readCSVString(str, istr, settings.csv); + deserializeFromString(*this, column, str, settings); +} + +void SerializationCustomSimpleText::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeJSONString(serializeToString(*this, column, row_num, settings), ostr, settings); +} + +void SerializationCustomSimpleText::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readJSONString(str, istr); + deserializeFromString(*this, column, str, settings); +} + +void 
SerializationCustomSimpleText::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeXMLStringForTextElement(serializeToString(*this, column, row_num, settings), ostr); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationCustomSimpleText.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationCustomSimpleText.h new file mode 100644 index 00000000000..21d6f8af650 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationCustomSimpleText.h @@ -0,0 +1,59 @@ +#pragma once + +#include <DataTypes/Serializations/SerializationWrapper.h> + +namespace DB +{ + +class ReadBuffer; +class WriteBuffer; +struct FormatSettings; +class IColumn; + +/** Simple ISerialization that uses serializeText/deserializeText + * for all serialization and deserialization. */ +class SerializationCustomSimpleText : public SerializationWrapper +{ +public: + explicit SerializationCustomSimpleText(const SerializationPtr & nested_); + + // Methods that subclasses must override in order to get full serialization/deserialization support. + virtual void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override = 0; + /// whole = true means that buffer contains only one value, so we should read until EOF. + /// It's needed to check if there is garbage after parsed field. + virtual void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const = 0; + + /** Text deserialization in case when buffer contains only one value, without any escaping and delimiters. + */ + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + /** Text serialization with escaping but without quoting. 
+ */ + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + /** Text serialization as a literal that may be inserted into a query. + */ + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + /** Text serialization for the CSV format. + */ + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + /** delimiter - the delimiter we expect when reading a string value that is not double-quoted + * (the delimiter is not consumed). + */ + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + /** Text serialization intended for using in JSON format. + * force_quoting_64bit_integers parameter forces to brace UInt64 and Int64 types into quotes. + */ + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + /** Text serialization for putting into the XML format. 
+ */ + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationDate.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDate.cpp new file mode 100644 index 00000000000..1ed48fdd31d --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDate.cpp @@ -0,0 +1,88 @@ +#include <DataTypes/Serializations/SerializationDate.h> + +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> + +#include <Columns/ColumnsNumber.h> +#include <Formats/ProtobufReader.h> + +#include <Common/assert_cast.h> + +namespace DB +{ + +void SerializationDate::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeDateText(DayNum(assert_cast<const ColumnUInt16 &>(column).getData()[row_num]), ostr, time_zone); +} + +void SerializationDate::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextEscaped(column, istr, settings); + if (!istr.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "Date"); +} + +void SerializationDate::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + readDateText(x, istr, time_zone); + assert_cast<ColumnUInt16 &>(column).getData().push_back(x); +} + +void SerializationDate::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeText(column, row_num, ostr, settings); +} + +void SerializationDate::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); +} + +void SerializationDate::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const 
FormatSettings &) const +{ + DayNum x; + assertChar('\'', istr); + readDateText(x, istr, time_zone); + assertChar('\'', istr); + assert_cast<ColumnUInt16 &>(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. +} + +void SerializationDate::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDate::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + assertChar('"', istr); + readDateText(x, istr, time_zone); + assertChar('"', istr); + assert_cast<ColumnUInt16 &>(column).getData().push_back(x); +} + +void SerializationDate::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDate::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum value; + readCSV(value, istr, time_zone); + assert_cast<ColumnUInt16 &>(column).getData().push_back(value); +} + +SerializationDate::SerializationDate(const DateLUTImpl & time_zone_) : time_zone(time_zone_) +{ +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationDate.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDate.h new file mode 100644 index 00000000000..f751b06fba6 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDate.h @@ -0,0 +1,29 @@ +#pragma once + +#include <DataTypes/Serializations/SerializationNumber.h> +#include <Common/DateLUT.h> + +namespace DB +{ + +class SerializationDate final : public SerializationNumber<UInt16> +{ +public: + explicit SerializationDate(const DateLUTImpl & time_zone_ = DateLUT::instance()); + + void 
serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + +protected: + const DateLUTImpl & time_zone; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationDate32.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDate32.cpp new file mode 100644 index 00000000000..851710de839 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDate32.cpp @@ -0,0 +1,85 @@ +#include <DataTypes/Serializations/SerializationDate32.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> + +#include <Columns/ColumnsNumber.h> + +#include <Common/assert_cast.h> + +namespace DB +{ + +void SerializationDate32::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeDateText(ExtendedDayNum(assert_cast<const ColumnInt32 &>(column).getData()[row_num]), ostr, time_zone); +} + +void 
SerializationDate32::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextEscaped(column, istr, settings); + if (!istr.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "Date32"); +} + +void SerializationDate32::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ExtendedDayNum x; + readDateText(x, istr, time_zone); + assert_cast<ColumnInt32 &>(column).getData().push_back(x); +} + +void SerializationDate32::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeText(column, row_num, ostr, settings); +} + +void SerializationDate32::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); +} + +void SerializationDate32::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ExtendedDayNum x; + assertChar('\'', istr); + readDateText(x, istr, time_zone); + assertChar('\'', istr); + assert_cast<ColumnInt32 &>(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. 
+} + +void SerializationDate32::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDate32::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ExtendedDayNum x; + assertChar('"', istr); + readDateText(x, istr, time_zone); + assertChar('"', istr); + assert_cast<ColumnInt32 &>(column).getData().push_back(x); +} + +void SerializationDate32::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDate32::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + LocalDate value; + readCSV(value, istr); + assert_cast<ColumnInt32 &>(column).getData().push_back(value.getExtenedDayNum()); +} + +SerializationDate32::SerializationDate32(const DateLUTImpl & time_zone_) : time_zone(time_zone_) +{ +} +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationDate32.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDate32.h new file mode 100644 index 00000000000..49560fb6c7d --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDate32.h @@ -0,0 +1,27 @@ +#pragma once + +#include <DataTypes/Serializations/SerializationNumber.h> +#include <Common/DateLUT.h> + +namespace DB +{ +class SerializationDate32 final : public SerializationNumber<Int32> +{ +public: + explicit SerializationDate32(const DateLUTImpl & time_zone_ = DateLUT::instance()); + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void 
serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + +protected: + const DateLUTImpl & time_zone; +}; +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationDateTime.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDateTime.cpp new file mode 100644 index 00000000000..2ba24f5351b --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDateTime.cpp @@ -0,0 +1,179 @@ +#include <DataTypes/Serializations/SerializationDateTime.h> + +#include <Columns/ColumnVector.h> +#include <Common/assert_cast.h> +#include <Common/DateLUT.h> +#include <Formats/FormatSettings.h> +#include <Formats/ProtobufReader.h> +#include <Formats/ProtobufWriter.h> +#include <IO/Operators.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> +#include <IO/parseDateTimeBestEffort.h> +#include <IO/ReadBufferFromString.h> + +namespace DB +{ + +namespace +{ + +inline void readText(time_t & x, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone) +{ + switch (settings.date_time_input_format) + { + case 
FormatSettings::DateTimeInputFormat::Basic: + readDateTimeText(x, istr, time_zone); + return; + case FormatSettings::DateTimeInputFormat::BestEffort: + parseDateTimeBestEffort(x, istr, time_zone, utc_time_zone); + return; + case FormatSettings::DateTimeInputFormat::BestEffortUS: + parseDateTimeBestEffortUS(x, istr, time_zone, utc_time_zone); + return; + } +} + +} + +SerializationDateTime::SerializationDateTime(const TimezoneMixin & time_zone_) + : TimezoneMixin(time_zone_) +{ +} + +void SerializationDateTime::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + auto value = assert_cast<const ColumnType &>(column).getData()[row_num]; + switch (settings.date_time_output_format) + { + case FormatSettings::DateTimeOutputFormat::Simple: + writeDateTimeText(value, ostr, time_zone); + return; + case FormatSettings::DateTimeOutputFormat::UnixTimestamp: + writeIntText(value, ostr); + return; + case FormatSettings::DateTimeOutputFormat::ISO: + writeDateTimeTextISO(value, ostr, utc_time_zone); + return; + } +} + +void SerializationDateTime::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeText(column, row_num, ostr, settings); +} + +void SerializationDateTime::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextEscaped(column, istr, settings); + if (!istr.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "DateTime"); +} + +void SerializationDateTime::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + readText(x, istr, settings, time_zone, utc_time_zone); + if (x < 0) + x = 0; + assert_cast<ColumnType &>(column).getData().push_back(static_cast<UInt32>(x)); +} + +void SerializationDateTime::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const 
FormatSettings & settings) const +{ + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); +} + +void SerializationDateTime::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' + { + readText(x, istr, settings, time_zone, utc_time_zone); + assertChar('\'', istr); + } + else /// Just 1504193808 or 01504193808 + { + readIntText(x, istr); + } + if (x < 0) + x = 0; + + /// It's important to do this at the end - for exception safety. + assert_cast<ColumnType &>(column).getData().push_back(static_cast<UInt32>(x)); +} + +void SerializationDateTime::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDateTime::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (checkChar('"', istr)) + { + readText(x, istr, settings, time_zone, utc_time_zone); + assertChar('"', istr); + } + else + { + readIntText(x, istr); + } + if (x < 0) + x = 0; + assert_cast<ColumnType &>(column).getData().push_back(static_cast<UInt32>(x)); +} + +void SerializationDateTime::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDateTime::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + + if (istr.eof()) + throwReadAfterEOF(); + + char maybe_quote = *istr.position(); + + if (maybe_quote == '\'' || maybe_quote == '\"') + { + ++istr.position(); + readText(x, istr, settings, time_zone, utc_time_zone); + 
assertChar(maybe_quote, istr); + } + else + { + if (settings.csv.delimiter != ',' || settings.date_time_input_format == FormatSettings::DateTimeInputFormat::Basic) + { + readText(x, istr, settings, time_zone, utc_time_zone); + } + /// Best effort parsing supports datetime in format like "01.01.2000, 00:00:00" + /// and can mistakenly read comma as a part of datetime. + /// For example data "...,01.01.2000,some string,..." cannot be parsed correctly. + /// To fix this problem we first read CSV string and then try to parse it as datetime. + else + { + String datetime_str; + readCSVString(datetime_str, istr, settings.csv); + ReadBufferFromString buf(datetime_str); + readText(x, buf, settings, time_zone, utc_time_zone); + } + } + + if (x < 0) + x = 0; + + assert_cast<ColumnType &>(column).getData().push_back(static_cast<UInt32>(x)); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationDateTime.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDateTime.h new file mode 100644 index 00000000000..f4a142483e5 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDateTime.h @@ -0,0 +1,29 @@ +#pragma once + +#include <DataTypes/Serializations/SerializationNumber.h> +#include <DataTypes/TimezoneMixin.h> + +class DateLUTImpl; + +namespace DB +{ + +class SerializationDateTime final : public SerializationNumber<UInt32>, public TimezoneMixin +{ +public: + explicit SerializationDateTime(const TimezoneMixin & time_zone_); + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void 
serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; +}; + +} + diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationDateTime64.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDateTime64.cpp new file mode 100644 index 00000000000..c5964f1bd97 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDateTime64.cpp @@ -0,0 +1,174 @@ +#include <DataTypes/Serializations/SerializationDateTime64.h> + +#include <Columns/ColumnVector.h> +#include <Common/assert_cast.h> +#include <Common/DateLUT.h> +#include <Formats/FormatSettings.h> +#include <Formats/ProtobufReader.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteBufferFromString.h> +#include <IO/WriteHelpers.h> +#include <IO/parseDateTimeBestEffort.h> +#include <IO/ReadBufferFromString.h> + +namespace DB +{ + +SerializationDateTime64::SerializationDateTime64( + UInt32 scale_, const TimezoneMixin & time_zone_) + : SerializationDecimalBase<DateTime64>(DecimalUtils::max_precision<DateTime64>, scale_) + , TimezoneMixin(time_zone_) +{ +} + +void SerializationDateTime64::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + auto value = assert_cast<const ColumnType &>(column).getData()[row_num]; + switch (settings.date_time_output_format) + { + case 
FormatSettings::DateTimeOutputFormat::Simple: + writeDateTimeText(value, scale, ostr, time_zone); + return; + case FormatSettings::DateTimeOutputFormat::UnixTimestamp: + writeDateTimeUnixTimestamp(value, scale, ostr); + return; + case FormatSettings::DateTimeOutputFormat::ISO: + writeDateTimeTextISO(value, scale, ostr, utc_time_zone); + return; + } +} + +void SerializationDateTime64::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const +{ + DateTime64 result = 0; + readDateTime64Text(result, scale, istr, time_zone); + assert_cast<ColumnType &>(column).getData().push_back(result); + + if (whole && !istr.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "DateTime64"); +} + +void SerializationDateTime64::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextEscaped(column, istr, settings); + if (!istr.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "DateTime64"); +} + +void SerializationDateTime64::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeText(column, row_num, ostr, settings); +} + +static inline void readText(DateTime64 & x, UInt32 scale, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone) +{ + switch (settings.date_time_input_format) + { + case FormatSettings::DateTimeInputFormat::Basic: + readDateTime64Text(x, scale, istr, time_zone); + return; + case FormatSettings::DateTimeInputFormat::BestEffort: + parseDateTime64BestEffort(x, scale, istr, time_zone, utc_time_zone); + return; + case FormatSettings::DateTimeInputFormat::BestEffortUS: + parseDateTime64BestEffortUS(x, scale, istr, time_zone, utc_time_zone); + return; + } +} + +void SerializationDateTime64::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + 
DateTime64 x = 0; + readText(x, scale, istr, settings, time_zone, utc_time_zone); + assert_cast<ColumnType &>(column).getData().push_back(x); +} + +void SerializationDateTime64::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); +} + +void SerializationDateTime64::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' + { + readText(x, scale, istr, settings, time_zone, utc_time_zone); + assertChar('\'', istr); + } + else /// Just 1504193808 or 01504193808 + { + readIntText(x, istr); + } + assert_cast<ColumnType &>(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. +} + +void SerializationDateTime64::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDateTime64::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + if (checkChar('"', istr)) + { + readText(x, scale, istr, settings, time_zone, utc_time_zone); + assertChar('"', istr); + } + else + { + readIntText(x, istr); + } + assert_cast<ColumnType &>(column).getData().push_back(x); +} + +void SerializationDateTime64::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationDateTime64::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + DateTime64 x = 0; + + if (istr.eof()) + 
throwReadAfterEOF(); + + char maybe_quote = *istr.position(); + + if (maybe_quote == '\'' || maybe_quote == '\"') + { + ++istr.position(); + readText(x, scale, istr, settings, time_zone, utc_time_zone); + assertChar(maybe_quote, istr); + } + else + { + if (settings.csv.delimiter != ',' || settings.date_time_input_format == FormatSettings::DateTimeInputFormat::Basic) + { + readText(x, scale, istr, settings, time_zone, utc_time_zone); + } + /// Best effort parsing supports datetime in format like "01.01.2000, 00:00:00" + /// and can mistakenly read comma as a part of datetime. + /// For example data "...,01.01.2000,some string,..." cannot be parsed correctly. + /// To fix this problem we first read CSV string and then try to parse it as datetime. + else + { + String datetime_str; + readCSVString(datetime_str, istr, settings.csv); + ReadBufferFromString buf(datetime_str); + readText(x, scale, buf, settings, time_zone, utc_time_zone); + } + } + + assert_cast<ColumnType &>(column).getData().push_back(x); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationDateTime64.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDateTime64.h new file mode 100644 index 00000000000..f817edbf0dd --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDateTime64.h @@ -0,0 +1,29 @@ +#pragma once + +#include <DataTypes/Serializations/SerializationDecimalBase.h> +#include <DataTypes/TimezoneMixin.h> + +class DateLUTImpl; + +namespace DB +{ + +class SerializationDateTime64 final : public SerializationDecimalBase<DateTime64>, public TimezoneMixin +{ +public: + SerializationDateTime64(UInt32 scale_, const TimezoneMixin & time_zone_); + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; + void deserializeWholeText(IColumn & column, 
ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationDecimal.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDecimal.cpp new file mode 100644 index 00000000000..9de85d338e9 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDecimal.cpp @@ -0,0 +1,99 @@ +#include <DataTypes/Serializations/SerializationDecimal.h> + +#include <Columns/ColumnVector.h> +#include <Common/assert_cast.h> +#include <Common/typeid_cast.h> +#include <Formats/ProtobufReader.h> +#include <Formats/ProtobufWriter.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> +#include <IO/readDecimalText.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int DECIMAL_OVERFLOW; +} + +template <typename T> +bool SerializationDecimal<T>::tryReadText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale) +{ + UInt32 unread_scale = scale; + if (!tryReadDecimalText(istr, x, precision, unread_scale)) + return false; + + if 
(common::mulOverflow(x.value, DecimalUtils::scaleMultiplier<T>(unread_scale), x.value)) + return false; + + return true; +} + +template <typename T> +void SerializationDecimal<T>::readText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale, bool csv) +{ + UInt32 unread_scale = scale; + if (csv) + readCSVDecimalText(istr, x, precision, unread_scale); + else + readDecimalText(istr, x, precision, unread_scale); + + if (common::mulOverflow(x.value, DecimalUtils::scaleMultiplier<T>(unread_scale), x.value)) + throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "Decimal math overflow"); +} + +template <typename T> +void SerializationDecimal<T>::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + T value = assert_cast<const ColumnType &>(column).getData()[row_num]; + writeText(value, this->scale, ostr, settings.decimal_trailing_zeros); +} + +template <typename T> +void SerializationDecimal<T>::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const +{ + T x; + readText(x, istr); + assert_cast<ColumnType &>(column).getData().push_back(x); + + if (whole && !istr.eof()) + ISerialization::throwUnexpectedDataAfterParsedValue(column, istr, settings, "Decimal"); +} + +template <typename T> +void SerializationDecimal<T>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + T x; + readText(x, istr, true); + assert_cast<ColumnType &>(column).getData().push_back(x); +} + +template <typename T> +void SerializationDecimal<T>::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + if (settings.json.quote_decimals) + writeChar('"', ostr); + + serializeText(column, row_num, ostr, settings); + + if (settings.json.quote_decimals) + writeChar('"', ostr); +} + +template <typename T> +void SerializationDecimal<T>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const 
FormatSettings & settings) const +{ + bool have_quotes = checkChar('"', istr); + deserializeText(column, istr, settings, false); + if (have_quotes) + assertChar('"', istr); +} + + +template class SerializationDecimal<Decimal32>; +template class SerializationDecimal<Decimal64>; +template class SerializationDecimal<Decimal128>; +template class SerializationDecimal<Decimal256>; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationDecimal.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDecimal.h new file mode 100644 index 00000000000..57decdd0973 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDecimal.h @@ -0,0 +1,30 @@ +#pragma once + +#include <DataTypes/Serializations/SerializationDecimalBase.h> + +namespace DB +{ + +template <typename T> +class SerializationDecimal final : public SerializationDecimalBase<T> +{ +public: + using typename SerializationDecimalBase<T>::ColumnType; + + SerializationDecimal(UInt32 precision_, UInt32 scale_) + : SerializationDecimalBase<T>(precision_, scale_) {} + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void readText(T & x, ReadBuffer & istr, bool csv = false) const { readText(x, istr, this->precision, this->scale, csv); } + + static void readText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_, bool csv = false); + static bool tryReadText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_); +}; + +} diff 
--git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationDecimalBase.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDecimalBase.cpp new file mode 100644 index 00000000000..b7f91e6833e --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDecimalBase.cpp @@ -0,0 +1,79 @@ +#include <DataTypes/Serializations/SerializationDecimalBase.h> + +#include <Common/assert_cast.h> +#include <Common/typeid_cast.h> +#include <Formats/ProtobufReader.h> +#include <Formats/ProtobufWriter.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> + +#include <ranges> + +namespace DB +{ + +template <typename T> +void SerializationDecimalBase<T>::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const +{ + FieldType x = field.get<DecimalField<T>>(); + writeBinaryLittleEndian(x, ostr); +} + +template <typename T> +void SerializationDecimalBase<T>::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const FieldType & x = assert_cast<const ColumnType &>(column).getElement(row_num); + writeBinaryLittleEndian(x, ostr); +} + +template <typename T> +void SerializationDecimalBase<T>::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const typename ColumnType::Container & x = typeid_cast<const ColumnType &>(column).getData(); + if (const size_t size = x.size(); limit == 0 || offset + limit > size) + limit = size - offset; + + if constexpr (std::endian::native == std::endian::big) + for (size_t i = offset; i < offset + limit; ++i) + writeBinaryLittleEndian(x[i], ostr); + else + ostr.write(reinterpret_cast<const char *>(&x[offset]), sizeof(FieldType) * limit); +} + +template <typename T> +void SerializationDecimalBase<T>::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const +{ + typename FieldType::NativeType x; + readBinaryLittleEndian(x, istr); + field = 
DecimalField(T(x), this->scale); +} + +template <typename T> +void SerializationDecimalBase<T>::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + typename FieldType::NativeType x; + readBinaryLittleEndian(x, istr); + assert_cast<ColumnType &>(column).getData().push_back(FieldType(x)); +} + +template <typename T> +void SerializationDecimalBase<T>::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double) const +{ + typename ColumnType::Container & x = typeid_cast<ColumnType &>(column).getData(); + const size_t initial_size = x.size(); + x.resize(initial_size + limit); + const size_t size = istr.readBig(reinterpret_cast<char *>(&x[initial_size]), sizeof(FieldType) * limit); + x.resize(initial_size + size / sizeof(FieldType)); + + if constexpr (std::endian::native == std::endian::big) + for (size_t i = initial_size; i < x.size(); ++i) + transformEndianness<std::endian::big, std::endian::little>(x[i]); +} + +template class SerializationDecimalBase<Decimal32>; +template class SerializationDecimalBase<Decimal64>; +template class SerializationDecimalBase<Decimal128>; +template class SerializationDecimalBase<Decimal256>; +template class SerializationDecimalBase<DateTime64>; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationDecimalBase.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDecimalBase.h new file mode 100644 index 00000000000..08f963cedbb --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationDecimalBase.h @@ -0,0 +1,32 @@ +#pragma once + +#include <DataTypes/Serializations/SimpleTextSerialization.h> +#include <Columns/ColumnDecimal.h> + +namespace DB +{ + +template <typename T> +class SerializationDecimalBase : public SimpleTextSerialization +{ +protected: + const UInt32 precision; + const UInt32 scale; + +public: + using FieldType = T; + using ColumnType = ColumnDecimal<T>; + + SerializationDecimalBase(UInt32 precision_, UInt32 
scale_) + : precision(precision_), scale(scale_) {} + + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationEnum.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationEnum.cpp new file mode 100644 index 00000000000..a1bd63d4327 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationEnum.cpp @@ -0,0 +1,117 @@ +#include <DataTypes/Serializations/SerializationEnum.h> + +#include <Columns/ColumnVector.h> +#include <Common/assert_cast.h> +#include <IO/WriteBufferFromString.h> +#include <Formats/FormatSettings.h> +#include <Formats/ProtobufReader.h> +#include <Formats/ProtobufWriter.h> + +namespace DB +{ + +template <typename Type> +void SerializationEnum<Type>::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeString(this->getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]), ostr); +} + +template <typename Type> +void SerializationEnum<Type>::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeEscapedString(this->getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]).toView(), ostr); +} + +template <typename Type> +void 
SerializationEnum<Type>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (settings.tsv.enum_as_number) + assert_cast<ColumnType &>(column).getData().push_back(readValue(istr)); + else + { + /// NOTE It would be nice to do without creating a temporary object - at least extract std::string out. + std::string field_name; + readEscapedString(field_name, istr); + assert_cast<ColumnType &>(column).getData().push_back(this->getValue(StringRef(field_name), true)); + } +} + +template <typename Type> +void SerializationEnum<Type>::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeQuotedString(this->getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]), ostr); +} + +template <typename Type> +void SerializationEnum<Type>::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + std::string field_name; + readQuotedStringWithSQLStyle(field_name, istr); + assert_cast<ColumnType &>(column).getData().push_back(this->getValue(StringRef(field_name))); +} + +template <typename Type> +void SerializationEnum<Type>::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (settings.tsv.enum_as_number) + { + assert_cast<ColumnType &>(column).getData().push_back(readValue(istr)); + if (!istr.eof()) + ISerialization::throwUnexpectedDataAfterParsedValue(column, istr, settings, "Enum"); + } + else + { + std::string field_name; + readStringUntilEOF(field_name, istr); + assert_cast<ColumnType &>(column).getData().push_back(this->getValue(StringRef(field_name), true)); + } +} + +template <typename Type> +void SerializationEnum<Type>::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeJSONString(this->getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]).toView(), 
ostr, settings); +} + +template <typename Type> +void SerializationEnum<Type>::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeXMLStringForTextElement(this->getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]).toView(), ostr); +} + +template <typename Type> +void SerializationEnum<Type>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + if (!istr.eof() && *istr.position() != '"') + assert_cast<ColumnType &>(column).getData().push_back(readValue(istr)); + else + { + std::string field_name; + readJSONString(field_name, istr); + assert_cast<ColumnType &>(column).getData().push_back(this->getValue(StringRef(field_name))); + } +} + +template <typename Type> +void SerializationEnum<Type>::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeCSVString(this->getNameForValue(assert_cast<const ColumnType &>(column).getData()[row_num]), ostr); +} + +template <typename Type> +void SerializationEnum<Type>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (settings.csv.enum_as_number) + assert_cast<ColumnType &>(column).getData().push_back(readValue(istr)); + else + { + std::string field_name; + readCSVString(field_name, istr, settings.csv); + assert_cast<ColumnType &>(column).getData().push_back(this->getValue(StringRef(field_name), true)); + } +} + +template class SerializationEnum<Int8>; +template class SerializationEnum<Int16>; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationEnum.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationEnum.h new file mode 100644 index 00000000000..bdd769b59c5 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationEnum.h @@ -0,0 +1,40 @@ +#pragma once + +#include <DataTypes/Serializations/SerializationNumber.h> +#include 
<DataTypes/EnumValues.h> + +namespace DB +{ + +template <typename Type> +class SerializationEnum : public SerializationNumber<Type>, public EnumValues<Type> +{ +public: + using typename SerializationNumber<Type>::FieldType; + using typename SerializationNumber<Type>::ColumnType; + using typename EnumValues<Type>::Values; + + explicit SerializationEnum(const Values & values_) : EnumValues<Type>(values_) {} + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + FieldType readValue(ReadBuffer & istr) const + { + FieldType x; + readText(x, istr); + return this->findByValue(x)->first; + } +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationFixedString.cpp 
b/contrib/clickhouse/src/DataTypes/Serializations/SerializationFixedString.cpp new file mode 100644 index 00000000000..3b405f6ec08 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationFixedString.cpp @@ -0,0 +1,214 @@ +#include <DataTypes/Serializations/SerializationFixedString.h> + +#include <Columns/ColumnFixedString.h> +#include <Columns/ColumnConst.h> + +#include <Formats/FormatSettings.h> +#include <Formats/ProtobufReader.h> +#include <Formats/ProtobufWriter.h> + +#include <IO/WriteBuffer.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> +#include <IO/VarInt.h> + +#include <Common/typeid_cast.h> +#include <Common/assert_cast.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_READ_ALL_DATA; + extern const int TOO_LARGE_STRING_SIZE; +} + +static constexpr size_t MAX_STRINGS_SIZE = 1ULL << 30; + +void SerializationFixedString::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const +{ + const String & s = field.get<const String &>(); + ostr.write(s.data(), std::min(s.size(), n)); + if (s.size() < n) + for (size_t i = s.size(); i < n; ++i) + ostr.write(0); +} + + +void SerializationFixedString::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const +{ + field = String(); + String & s = field.get<String &>(); + s.resize(n); + istr.readStrict(s.data(), n); +} + + +void SerializationFixedString::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + ostr.write(reinterpret_cast<const char *>(&assert_cast<const ColumnFixedString &>(column).getChars()[n * row_num]), n); +} + + +void SerializationFixedString::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ColumnFixedString::Chars & data = assert_cast<ColumnFixedString &>(column).getChars(); + size_t old_size = data.size(); + data.resize(old_size + n); + try + { + istr.readStrict(reinterpret_cast<char 
*>(data.data() + old_size), n); + } + catch (...) + { + data.resize_assume_reserved(old_size); + throw; + } +} + + +void SerializationFixedString::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const ColumnFixedString::Chars & data = typeid_cast<const ColumnFixedString &>(column).getChars(); + + size_t size = data.size() / n; + + if (limit == 0 || offset + limit > size) + limit = size - offset; + + if (limit) + ostr.write(reinterpret_cast<const char *>(&data[n * offset]), n * limit); +} + + +void SerializationFixedString::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const +{ + ColumnFixedString::Chars & data = typeid_cast<ColumnFixedString &>(column).getChars(); + + size_t initial_size = data.size(); + size_t max_bytes; + size_t new_data_size; + + if (unlikely(__builtin_mul_overflow(limit, n, &max_bytes))) + throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Deserializing FixedString will lead to overflow"); + if (unlikely(max_bytes > MAX_STRINGS_SIZE)) + throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too large sizes of FixedString to deserialize: {}", max_bytes); + if (unlikely(__builtin_add_overflow(initial_size, max_bytes, &new_data_size))) + throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Deserializing FixedString will lead to overflow"); + + data.resize(new_data_size); + size_t read_bytes = istr.readBig(reinterpret_cast<char *>(&data[initial_size]), max_bytes); + + if (read_bytes % n != 0) + throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Cannot read all data of type FixedString. " + "Bytes read:{}. 
String size:{}.", read_bytes, toString(n)); + + data.resize(initial_size + read_bytes); +} + + +void SerializationFixedString::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeString(reinterpret_cast<const char *>(&assert_cast<const ColumnFixedString &>(column).getChars()[n * row_num]), n, ostr); +} + + +void SerializationFixedString::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const char * pos = reinterpret_cast<const char *>(&assert_cast<const ColumnFixedString &>(column).getChars()[n * row_num]); + writeAnyEscapedString<'\''>(pos, pos + n, ostr); +} + + +void SerializationFixedString::alignStringLength(size_t n, PaddedPODArray<UInt8> & data, size_t string_start) +{ + size_t length = data.size() - string_start; + if (length < n) + { + data.resize_fill(string_start + n); + } + else if (length > n) + { + data.resize_assume_reserved(string_start); + throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too large value for FixedString({})", n); + } +} + +template <typename Reader> +static inline void read(const SerializationFixedString & self, IColumn & column, Reader && reader) +{ + ColumnFixedString::Chars & data = typeid_cast<ColumnFixedString &>(column).getChars(); + size_t prev_size = data.size(); + try + { + reader(data); + SerializationFixedString::alignStringLength(self.getN(), data, prev_size); + } + catch (...) 
+ { + data.resize_assume_reserved(prev_size); + throw; + } +} + + +void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); }); +} + + +void SerializationFixedString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const char * pos = reinterpret_cast<const char *>(&assert_cast<const ColumnFixedString &>(column).getChars()[n * row_num]); + writeAnyQuotedString<'\''>(pos, pos + n, ostr); +} + + +void SerializationFixedString::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(*this, column, [&istr](ColumnFixedString::Chars & data) { readQuotedStringInto<true>(data, istr); }); +} + + +void SerializationFixedString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringUntilEOFInto(data, istr); }); +} + + +void SerializationFixedString::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const char * pos = reinterpret_cast<const char *>(&assert_cast<const ColumnFixedString &>(column).getChars()[n * row_num]); + writeJSONString(pos, pos + n, ostr, settings); +} + + +void SerializationFixedString::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(*this, column, [&istr](ColumnFixedString::Chars & data) { readJSONStringInto(data, istr); }); +} + + +void SerializationFixedString::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const char * pos = reinterpret_cast<const char *>(&assert_cast<const ColumnFixedString &>(column).getChars()[n * row_num]); + writeXMLStringForTextElement(pos, pos + n, ostr); +} + + 
+void SerializationFixedString::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + const char * pos = reinterpret_cast<const char *>(&assert_cast<const ColumnFixedString &>(column).getChars()[n * row_num]); + writeCSVString(pos, pos + n, ostr); +} + + +void SerializationFixedString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + read(*this, column, [&istr, &csv = settings.csv](ColumnFixedString::Chars & data) { readCSVStringInto(data, istr, csv); }); +} + + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationFixedString.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationFixedString.h new file mode 100644 index 00000000000..3db31ab02cb --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationFixedString.h @@ -0,0 +1,50 @@ +#pragma once + +#include <DataTypes/Serializations/ISerialization.h> +#include <Common/PODArray.h> + +namespace DB +{ + +class SerializationFixedString : public ISerialization +{ +private: + size_t n; + +public: + explicit SerializationFixedString(size_t n_) : n(n_) {} + size_t getN() const { return n; } + + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const 
override; + + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + /// Makes sure that the length of a newly inserted string to `chars` is equal to getN(). + /// If the length is less than getN() the function will add zero characters up to getN(). + /// If the length is greater than getN() the function will throw an exception. 
+ static void alignStringLength(size_t n, PaddedPODArray<UInt8> & data, size_t string_start); +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationIPv4andIPv6.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationIPv4andIPv6.h new file mode 100644 index 00000000000..61464962f1c --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationIPv4andIPv6.h @@ -0,0 +1,132 @@ +#pragma once + +#include <base/TypeName.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> +#include <Columns/ColumnsNumber.h> +#include <DataTypes/Serializations/SimpleTextSerialization.h> + +namespace DB +{ + +template <typename IPv> +class SerializationIP : public SimpleTextSerialization +{ +public: + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override + { + writeText(assert_cast<const ColumnVector<IPv> &>(column).getData()[row_num], ostr); + } + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override + { + IPv x; + readText(x, istr); + + if (whole && !istr.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName<IPv>.data()); + + assert_cast<ColumnVector<IPv> &>(column).getData().push_back(x); + } + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override + { + serializeText(column, row_num, ostr, settings); + } + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + deserializeText(column, istr, settings, false); + } + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override + { + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); + } + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const 
FormatSettings &) const override + { + IPv x; + assertChar('\'', istr); + readText(x, istr); + assertChar('\'', istr); + assert_cast<ColumnVector<IPv> &>(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. + } + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override + { + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); + } + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + IPv x; + assertChar('"', istr); + readText(x, istr); + /// this code looks weird, but we want to throw specific exception to match original behavior... + if (istr.eof()) + assertChar('"', istr); + if (*istr.position() != '"') + throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName<IPv>.data()); + istr.ignore(); + + assert_cast<ColumnVector<IPv> &>(column).getData().push_back(x); + } + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override + { + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); + } + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &/* settings*/) const override + { + IPv value; + readCSV(value, istr); + + assert_cast<ColumnVector<IPv> &>(column).getData().push_back(value); + } + + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override + { + IPv x = field.get<IPv>(); + writeBinary(x, ostr); + } + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override + { + IPv x; + readBinary(x.toUnderType(), istr); + field = NearestFieldType<IPv>(x); + } + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override + { + writeBinary(assert_cast<const 
ColumnVector<IPv> &>(column).getData()[row_num], ostr); + } + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override + { + IPv x; + readBinary(x.toUnderType(), istr); + assert_cast<ColumnVector<IPv> &>(column).getData().push_back(x); + } + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override + { + const typename ColumnVector<IPv>::Container & x = typeid_cast<const ColumnVector<IPv> &>(column).getData(); + + size_t size = x.size(); + + if (limit == 0 || offset + limit > size) + limit = size - offset; + + if (limit) + ostr.write(reinterpret_cast<const char *>(&x[offset]), sizeof(IPv) * limit); + } + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const override + { + typename ColumnVector<IPv>::Container & x = typeid_cast<ColumnVector<IPv> &>(column).getData(); + size_t initial_size = x.size(); + x.resize(initial_size + limit); + size_t size = istr.readBig(reinterpret_cast<char*>(&x[initial_size]), sizeof(IPv) * limit); + x.resize(initial_size + size / sizeof(IPv)); + } +}; + +using SerializationIPv4 = SerializationIP<IPv4>; +using SerializationIPv6 = SerializationIP<IPv6>; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationInfo.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationInfo.cpp new file mode 100644 index 00000000000..4e9b9905454 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationInfo.cpp @@ -0,0 +1,298 @@ +#include <DataTypes/Serializations/SerializationInfo.h> +#include <Columns/ColumnSparse.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> +#include <IO/VarInt.h> +#include <Core/Block.h> +#include <base/EnumReflection.h> + +#include <Poco/JSON/JSON.h> +#include <Poco/JSON/Object.h> +#include <Poco/JSON/Stringifier.h> +#include <Poco/JSON/Parser.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern 
const int CORRUPTED_DATA; +} + +namespace +{ + +constexpr auto KEY_VERSION = "version"; +constexpr auto KEY_NUM_ROWS = "num_rows"; +constexpr auto KEY_COLUMNS = "columns"; +constexpr auto KEY_NUM_DEFAULTS = "num_defaults"; +constexpr auto KEY_KIND = "kind"; +constexpr auto KEY_NAME = "name"; + +} + +void SerializationInfo::Data::add(const IColumn & column) +{ + size_t rows = column.size(); + double ratio = column.getRatioOfDefaultRows(ColumnSparse::DEFAULT_ROWS_SEARCH_SAMPLE_RATIO); + + num_rows += rows; + num_defaults += static_cast<size_t>(ratio * rows); +} + +void SerializationInfo::Data::add(const Data & other) +{ + num_rows += other.num_rows; + num_defaults += other.num_defaults; +} + +void SerializationInfo::Data::addDefaults(size_t length) +{ + num_rows += length; + num_defaults += length; +} + +SerializationInfo::SerializationInfo(ISerialization::Kind kind_, const Settings & settings_) + : settings(settings_) + , kind(kind_) +{ +} + +SerializationInfo::SerializationInfo(ISerialization::Kind kind_, const Settings & settings_, const Data & data_) + : settings(settings_) + , kind(kind_) + , data(data_) +{ +} + +void SerializationInfo::add(const IColumn & column) +{ + data.add(column); + if (settings.choose_kind) + kind = chooseKind(data, settings); +} + +void SerializationInfo::add(const SerializationInfo & other) +{ + data.add(other.data); + if (settings.choose_kind) + kind = chooseKind(data, settings); +} + +void SerializationInfo::addDefaults(size_t length) +{ + data.addDefaults(length); + if (settings.choose_kind) + kind = chooseKind(data, settings); +} + +void SerializationInfo::replaceData(const SerializationInfo & other) +{ + data = other.data; +} + +MutableSerializationInfoPtr SerializationInfo::clone() const +{ + return std::make_shared<SerializationInfo>(kind, settings, data); +} + +/// Returns true if all rows with default values of type 'lhs' +/// are mapped to default values of type 'rhs' after conversion. 
+static bool preserveDefaultsAfterConversion(const IDataType & lhs, const IDataType & rhs) +{ + if (lhs.equals(rhs)) + return true; + + bool lhs_is_columned_as_numeric = isColumnedAsNumber(lhs) || isColumnedAsDecimal(lhs); + bool rhs_is_columned_as_numeric = isColumnedAsNumber(rhs) || isColumnedAsDecimal(rhs); + + if (lhs_is_columned_as_numeric && rhs_is_columned_as_numeric) + return true; + + if (isStringOrFixedString(lhs) && isStringOrFixedString(rhs)) + return true; + + return false; +} + +std::shared_ptr<SerializationInfo> SerializationInfo::createWithType( + const IDataType & old_type, + const IDataType & new_type, + const Settings & new_settings) const +{ + auto new_kind = kind; + if (new_kind == ISerialization::Kind::SPARSE) + { + if (!new_type.supportsSparseSerialization() + || !preserveDefaultsAfterConversion(old_type, new_type)) + new_kind = ISerialization::Kind::DEFAULT; + } + + return std::make_shared<SerializationInfo>(new_kind, new_settings); +} + +void SerializationInfo::serialializeKindBinary(WriteBuffer & out) const +{ + writeBinary(static_cast<UInt8>(kind), out); +} + +void SerializationInfo::deserializeFromKindsBinary(ReadBuffer & in) +{ + UInt8 kind_num; + readBinary(kind_num, in); + auto maybe_kind = magic_enum::enum_cast<ISerialization::Kind>(kind_num); + if (!maybe_kind) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Unknown serialization kind {}", std::to_string(kind_num)); + + kind = *maybe_kind; +} + +Poco::JSON::Object SerializationInfo::toJSON() const +{ + Poco::JSON::Object object; + object.set(KEY_KIND, ISerialization::kindToString(kind)); + object.set(KEY_NUM_DEFAULTS, data.num_defaults); + object.set(KEY_NUM_ROWS, data.num_rows); + return object; +} + +void SerializationInfo::fromJSON(const Poco::JSON::Object & object) +{ + if (!object.has(KEY_KIND) || !object.has(KEY_NUM_DEFAULTS) || !object.has(KEY_NUM_ROWS)) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Missed field '{}' or '{}' or '{}' in SerializationInfo of columns", + 
KEY_KIND, KEY_NUM_DEFAULTS, KEY_NUM_ROWS); + + data.num_rows = object.getValue<size_t>(KEY_NUM_ROWS); + data.num_defaults = object.getValue<size_t>(KEY_NUM_DEFAULTS); + kind = ISerialization::stringToKind(object.getValue<String>(KEY_KIND)); +} + +ISerialization::Kind SerializationInfo::chooseKind(const Data & data, const Settings & settings) +{ + double ratio = data.num_rows ? std::min(static_cast<double>(data.num_defaults) / data.num_rows, 1.0) : 0.0; + return ratio > settings.ratio_of_defaults_for_sparse ? ISerialization::Kind::SPARSE : ISerialization::Kind::DEFAULT; +} + +SerializationInfoByName::SerializationInfoByName( + const NamesAndTypesList & columns, + const SerializationInfo::Settings & settings) +{ + if (settings.isAlwaysDefault()) + return; + + for (const auto & column : columns) + if (column.type->supportsSparseSerialization()) + emplace(column.name, column.type->createSerializationInfo(settings)); +} + +void SerializationInfoByName::add(const Block & block) +{ + for (const auto & column : block) + { + auto it = find(column.name); + if (it == end()) + continue; + + it->second->add(*column.column); + } +} + +void SerializationInfoByName::add(const SerializationInfoByName & other) +{ + for (const auto & [name, info] : other) + { + auto it = find(name); + if (it == end()) + continue; + + it->second->add(*info); + } +} + +void SerializationInfoByName::replaceData(const SerializationInfoByName & other) +{ + for (const auto & [name, new_info] : other) + { + auto & old_info = (*this)[name]; + + if (old_info) + old_info->replaceData(*new_info); + else + old_info = new_info->clone(); + } +} + +void SerializationInfoByName::writeJSON(WriteBuffer & out) const +{ + Poco::JSON::Object object; + object.set(KEY_VERSION, SERIALIZATION_INFO_VERSION); + + Poco::JSON::Array column_infos; + for (const auto & [name, info] : *this) + { + auto info_json = info->toJSON(); + info_json.set(KEY_NAME, name); + column_infos.add(std::move(info_json)); /// NOLINT + } + + 
object.set(KEY_COLUMNS, std::move(column_infos)); /// NOLINT + + std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + oss.exceptions(std::ios::failbit); + Poco::JSON::Stringifier::stringify(object, oss); + + return writeString(oss.str(), out); +} + +SerializationInfoByName SerializationInfoByName::readJSON( + const NamesAndTypesList & columns, const Settings & settings, ReadBuffer & in) +{ + String json_str; + readString(json_str, in); + + Poco::JSON::Parser parser; + auto object = parser.parse(json_str).extract<Poco::JSON::Object::Ptr>(); + + if (!object->has(KEY_VERSION)) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Missed version of serialization infos"); + + if (object->getValue<size_t>(KEY_VERSION) > SERIALIZATION_INFO_VERSION) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Unknown version of serialization infos ({}). Should be less or equal than {}", + object->getValue<size_t>(KEY_VERSION), SERIALIZATION_INFO_VERSION); + + SerializationInfoByName infos; + if (object->has(KEY_COLUMNS)) + { + std::unordered_map<std::string_view, const IDataType *> column_type_by_name; + for (const auto & [name, type] : columns) + column_type_by_name.emplace(name, type.get()); + + auto array = object->getArray(KEY_COLUMNS); + for (const auto & elem : *array) + { + auto elem_object = elem.extract<Poco::JSON::Object::Ptr>(); + + if (!elem_object->has(KEY_NAME)) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Missed field '{}' in serialization infos", KEY_NAME); + + auto name = elem_object->getValue<String>(KEY_NAME); + auto it = column_type_by_name.find(name); + + if (it == column_type_by_name.end()) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Found unexpected column '{}' in serialization infos", name); + + auto info = it->second->createSerializationInfo(settings); + info->fromJSON(*elem_object); + infos.emplace(name, std::move(info)); + } + } + + return infos; +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationInfo.h 
b/contrib/clickhouse/src/DataTypes/Serializations/SerializationInfo.h new file mode 100644 index 00000000000..3d8f4f1d00c --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationInfo.h @@ -0,0 +1,118 @@ +#pragma once + +#include <Core/Types_fwd.h> +#include <DataTypes/Serializations/ISerialization.h> +#include <Poco/JSON/Object.h> + + +namespace DB +{ + +class ReadBuffer; +class ReadBuffer; +class WriteBuffer; +class NamesAndTypesList; +class Block; + +constexpr auto SERIALIZATION_INFO_VERSION = 0; + +/** Contains information about kind of serialization of column and its subcolumns. + * Also contains information about content of columns, + * that helps to choose kind of serialization of column. + * + * Currently has only information about number of default rows, + * that helps to choose sparse serialization. + * + * Should be extended, when new kinds of serialization will be implemented. + */ +class SerializationInfo +{ +public: + struct Data + { + size_t num_rows = 0; + size_t num_defaults = 0; + + void add(const IColumn & column); + void add(const Data & other); + void addDefaults(size_t length); + }; + + struct Settings + { + const double ratio_of_defaults_for_sparse = 1.0; + const bool choose_kind = false; + + bool isAlwaysDefault() const { return ratio_of_defaults_for_sparse >= 1.0; } + }; + + SerializationInfo(ISerialization::Kind kind_, const Settings & settings_); + SerializationInfo(ISerialization::Kind kind_, const Settings & settings_, const Data & data_); + + virtual ~SerializationInfo() = default; + + virtual bool hasCustomSerialization() const { return kind != ISerialization::Kind::DEFAULT; } + virtual bool structureEquals(const SerializationInfo & rhs) const { return typeid(SerializationInfo) == typeid(rhs); } + + virtual void add(const IColumn & column); + virtual void add(const SerializationInfo & other); + virtual void addDefaults(size_t length); + virtual void replaceData(const SerializationInfo & other); + + virtual 
std::shared_ptr<SerializationInfo> clone() const; + + virtual std::shared_ptr<SerializationInfo> createWithType( + const IDataType & old_type, + const IDataType & new_type, + const Settings & new_settings) const; + + virtual void serialializeKindBinary(WriteBuffer & out) const; + virtual void deserializeFromKindsBinary(ReadBuffer & in); + + virtual Poco::JSON::Object toJSON() const; + virtual void fromJSON(const Poco::JSON::Object & object); + + void setKind(ISerialization::Kind kind_) { kind = kind_; } + const Settings & getSettings() const { return settings; } + const Data & getData() const { return data; } + ISerialization::Kind getKind() const { return kind; } + + static ISerialization::Kind chooseKind(const Data & data, const Settings & settings); + +protected: + const Settings settings; + + ISerialization::Kind kind; + Data data; +}; + +using SerializationInfoPtr = std::shared_ptr<const SerializationInfo>; +using MutableSerializationInfoPtr = std::shared_ptr<SerializationInfo>; + +using SerializationInfos = std::vector<SerializationInfoPtr>; +using MutableSerializationInfos = std::vector<MutableSerializationInfoPtr>; + +/// The order is important because info is serialized to part metadata. +class SerializationInfoByName : public std::map<String, MutableSerializationInfoPtr> +{ +public: + using Settings = SerializationInfo::Settings; + + SerializationInfoByName() = default; + SerializationInfoByName(const NamesAndTypesList & columns, const Settings & settings); + + void add(const Block & block); + void add(const SerializationInfoByName & other); + + /// Takes data from @other, but keeps current serialization kinds. + /// If column exists in @other infos, but not in current infos, + /// it's cloned to current infos. 
+ void replaceData(const SerializationInfoByName & other); + + void writeJSON(WriteBuffer & out) const; + + static SerializationInfoByName readJSON( + const NamesAndTypesList & columns, const Settings & settings, ReadBuffer & in); +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationInfoTuple.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationInfoTuple.cpp new file mode 100644 index 00000000000..d36668f03b6 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationInfoTuple.cpp @@ -0,0 +1,165 @@ +#include <DataTypes/Serializations/SerializationInfoTuple.h> +#include <DataTypes/DataTypeTuple.h> +#include <Columns/ColumnTuple.h> +#include <Common/assert_cast.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CORRUPTED_DATA; + extern const int THERE_IS_NO_COLUMN; +} + +SerializationInfoTuple::SerializationInfoTuple( + MutableSerializationInfos elems_, Names names_, const Settings & settings_) + : SerializationInfo(ISerialization::Kind::DEFAULT, settings_) + , elems(std::move(elems_)) + , names(std::move(names_)) +{ + assert(names.size() == elems.size()); + for (size_t i = 0; i < names.size(); ++i) + name_to_elem[names[i]] = elems[i]; +} + +bool SerializationInfoTuple::hasCustomSerialization() const +{ + return std::any_of(elems.begin(), elems.end(), [](const auto & elem) { return elem->hasCustomSerialization(); }); +} + +bool SerializationInfoTuple::structureEquals(const SerializationInfo & rhs) const +{ + const auto * rhs_tuple = typeid_cast<const SerializationInfoTuple *>(&rhs); + if (!rhs_tuple || elems.size() != rhs_tuple->elems.size()) + return false; + + for (size_t i = 0; i < elems.size(); ++i) + if (!elems[i]->structureEquals(*rhs_tuple->elems[i])) + return false; + + return true; +} + +void SerializationInfoTuple::add(const IColumn & column) +{ + SerializationInfo::add(column); + + const auto & column_tuple = assert_cast<const ColumnTuple &>(column); + const auto & 
right_elems = column_tuple.getColumns(); + assert(elems.size() == right_elems.size()); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->add(*right_elems[i]); +} + +void SerializationInfoTuple::add(const SerializationInfo & other) +{ + SerializationInfo::add(other); + + const auto & other_info = assert_cast<const SerializationInfoTuple &>(other); + for (const auto & [name, elem] : name_to_elem) + { + auto it = other_info.name_to_elem.find(name); + if (it != other_info.name_to_elem.end()) + elem->add(*it->second); + else + elem->addDefaults(other_info.getData().num_rows); + } +} + +void SerializationInfoTuple::addDefaults(size_t length) +{ + for (const auto & elem : elems) + elem->addDefaults(length); +} + +void SerializationInfoTuple::replaceData(const SerializationInfo & other) +{ + SerializationInfo::add(other); + + const auto & other_info = assert_cast<const SerializationInfoTuple &>(other); + for (const auto & [name, elem] : name_to_elem) + { + auto it = other_info.name_to_elem.find(name); + if (it != other_info.name_to_elem.end()) + elem->replaceData(*it->second); + } +} + +MutableSerializationInfoPtr SerializationInfoTuple::clone() const +{ + MutableSerializationInfos elems_cloned; + elems_cloned.reserve(elems.size()); + for (const auto & elem : elems) + elems_cloned.push_back(elem->clone()); + + return std::make_shared<SerializationInfoTuple>(std::move(elems_cloned), names, settings); +} + +MutableSerializationInfoPtr SerializationInfoTuple::createWithType( + const IDataType & old_type, + const IDataType & new_type, + const Settings & new_settings) const +{ + const auto & old_tuple = assert_cast<const DataTypeTuple &>(old_type); + const auto & new_tuple = assert_cast<const DataTypeTuple &>(new_type); + + const auto & old_elements = old_tuple.getElements(); + const auto & new_elements = new_tuple.getElements(); + + assert(elems.size() == old_elements.size()); + assert(elems.size() == new_elements.size()); + + MutableSerializationInfos infos; + 
infos.reserve(elems.size()); + for (size_t i = 0; i < elems.size(); ++i) + infos.push_back(elems[i]->createWithType(*old_elements[i], *new_elements[i], new_settings)); + + return std::make_shared<SerializationInfoTuple>(std::move(infos), names, new_settings); +} + +void SerializationInfoTuple::serialializeKindBinary(WriteBuffer & out) const +{ + SerializationInfo::serialializeKindBinary(out); + for (const auto & elem : elems) + elem->serialializeKindBinary(out); +} + +void SerializationInfoTuple::deserializeFromKindsBinary(ReadBuffer & in) +{ + SerializationInfo::deserializeFromKindsBinary(in); + for (const auto & elem : elems) + elem->deserializeFromKindsBinary(in); +} + +Poco::JSON::Object SerializationInfoTuple::toJSON() const +{ + auto object = SerializationInfo::toJSON(); + Poco::JSON::Array subcolumns; + for (const auto & elem : elems) + subcolumns.add(elem->toJSON()); + + object.set("subcolumns", subcolumns); + return object; +} + +void SerializationInfoTuple::fromJSON(const Poco::JSON::Object & object) +{ + SerializationInfo::fromJSON(object); + + if (!object.has("subcolumns")) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Missed field 'subcolumns' in SerializationInfo of columns SerializationInfoTuple"); + + auto subcolumns = object.getArray("subcolumns"); + if (elems.size() != subcolumns->size()) + throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, + "Mismatched number of subcolumns between JSON and SerializationInfoTuple." 
+ "Expected: {}, got: {}", elems.size(), subcolumns->size()); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->fromJSON(*subcolumns->getObject(static_cast<unsigned>(i))); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationInfoTuple.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationInfoTuple.h new file mode 100644 index 00000000000..a9f3bdb6c6e --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationInfoTuple.h @@ -0,0 +1,45 @@ +#pragma once +#include <Core/Names.h> +#include <DataTypes/Serializations/SerializationInfo.h> + +namespace DB +{ + +class SerializationInfoTuple : public SerializationInfo +{ +public: + SerializationInfoTuple(MutableSerializationInfos elems_, Names names_, const Settings & settings_); + + bool hasCustomSerialization() const override; + bool structureEquals(const SerializationInfo & rhs) const override; + + void add(const IColumn & column) override; + void add(const SerializationInfo & other) override; + void addDefaults(size_t length) override; + void replaceData(const SerializationInfo & other) override; + + MutableSerializationInfoPtr clone() const override; + + MutableSerializationInfoPtr createWithType( + const IDataType & old_type, + const IDataType & new_type, + const Settings & new_settings) const override; + + void serialializeKindBinary(WriteBuffer & out) const override; + void deserializeFromKindsBinary(ReadBuffer & in) override; + + Poco::JSON::Object toJSON() const override; + void fromJSON(const Poco::JSON::Object & object) override; + + const MutableSerializationInfoPtr & getElementInfo(size_t i) const { return elems[i]; } + ISerialization::Kind getElementKind(size_t i) const { return elems[i]->getKind(); } + +private: + MutableSerializationInfos elems; + Names names; + + using NameToElem = std::unordered_map<String, MutableSerializationInfoPtr>; + NameToElem name_to_elem; +}; + +} diff --git 
a/contrib/clickhouse/src/DataTypes/Serializations/SerializationInterval.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationInterval.cpp new file mode 100644 index 00000000000..59086d8aef3 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationInterval.cpp @@ -0,0 +1,209 @@ +#include "SerializationInterval.h" + +#include <Columns/ColumnsNumber.h> +#include <IO/WriteBuffer.h> +#include <Parsers/Kusto/Formatters.h> + +namespace DB +{ +using ColumnInterval = DataTypeInterval::ColumnType; + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int NOT_IMPLEMENTED; +} + +void SerializationKustoInterval::serializeText( + const IColumn & column, const size_t row, WriteBuffer & ostr, const FormatSettings &) const +{ + const auto * interval_column = checkAndGetColumn<ColumnInterval>(column); + if (!interval_column) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Expected column of underlying type of Interval"); + + const auto & value = interval_column->getData()[row]; + const auto ticks = kind.toAvgNanoseconds() * value / 100; + const auto interval_as_string = formatKQLTimespan(ticks); + ostr.write(interval_as_string.c_str(), interval_as_string.length()); +} + +void SerializationKustoInterval::deserializeText( + [[maybe_unused]] IColumn & column, + [[maybe_unused]] ReadBuffer & istr, + [[maybe_unused]] const FormatSettings & settings, + [[maybe_unused]] const bool whole) const +{ + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, "Deserialization is not implemented for {}", kind.toNameOfFunctionToIntervalDataType()); +} + +SerializationInterval::SerializationInterval(IntervalKind interval_kind_) : interval_kind(std::move(interval_kind_)) +{ +} + +void SerializationInterval::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const +{ + dispatch( + static_cast<void (ISerialization::*)(Field &, ReadBuffer &, const FormatSettings &) const>(&ISerialization::deserializeBinary), + 
settings.interval.output_format, + field, + istr, + settings); +} + +void SerializationInterval::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + dispatch( + static_cast<void (ISerialization::*)(IColumn &, ReadBuffer &, const FormatSettings &) const>(&ISerialization::deserializeBinary), + settings.interval.output_format, + column, + istr, + settings); +} + +void SerializationInterval::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const +{ + dispatch( + &ISerialization::deserializeBinaryBulk, FormatSettings::IntervalOutputFormat::Numeric, column, istr, limit, avg_value_size_hint); +} + +void SerializationInterval::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const +{ + dispatch(&ISerialization::deserializeBinaryBulkStatePrefix, FormatSettings::IntervalOutputFormat::Numeric, settings, state); +} + + +void SerializationInterval::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + dispatch( + &ISerialization::deserializeBinaryBulkWithMultipleStreams, + FormatSettings::IntervalOutputFormat::Numeric, + column, + limit, + settings, + state, + cache); +} + + +void SerializationInterval::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + dispatch(&ISerialization::deserializeTextCSV, settings.interval.output_format, column, istr, settings); +} + +void SerializationInterval::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + dispatch(&ISerialization::deserializeTextEscaped, settings.interval.output_format, column, istr, settings); +} + +void SerializationInterval::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) 
const +{ + dispatch(&ISerialization::deserializeTextJSON, settings.interval.output_format, column, istr, settings); +} + +void SerializationInterval::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + dispatch(&ISerialization::deserializeTextQuoted, settings.interval.output_format, column, istr, settings); +} + +void SerializationInterval::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + dispatch(&ISerialization::deserializeTextRaw, settings.interval.output_format, column, istr, settings); +} + + +void SerializationInterval::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + dispatch(&ISerialization::deserializeWholeText, settings.interval.output_format, column, istr, settings); +} + +void SerializationInterval::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const +{ + dispatch( + static_cast<void (ISerialization::*)(const Field &, WriteBuffer &, const FormatSettings &) const>(&ISerialization::serializeBinary), + settings.interval.output_format, + field, + ostr, + settings); +} + +void SerializationInterval::serializeBinary(const IColumn & column, size_t row, WriteBuffer & ostr, const FormatSettings & settings) const +{ + dispatch( + static_cast<void (ISerialization::*)(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const>( + &ISerialization::serializeBinary), + settings.interval.output_format, + column, + row, + ostr, + settings); +} + +void SerializationInterval::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + dispatch(&ISerialization::serializeBinaryBulk, FormatSettings::IntervalOutputFormat::Numeric, column, ostr, offset, limit); +} + +void SerializationInterval::serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) 
const +{ + dispatch(&ISerialization::serializeBinaryBulkStatePrefix, FormatSettings::IntervalOutputFormat::Numeric, column, settings, state); +} + +void SerializationInterval::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const +{ + dispatch(&ISerialization::serializeBinaryBulkStateSuffix, FormatSettings::IntervalOutputFormat::Numeric, settings, state); +} + +void SerializationInterval::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, size_t offset, size_t limit, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const +{ + dispatch( + &ISerialization::serializeBinaryBulkWithMultipleStreams, + FormatSettings::IntervalOutputFormat::Numeric, + column, + offset, + limit, + settings, + state); +} + +void SerializationInterval::serializeText(const IColumn & column, size_t row, WriteBuffer & ostr, const FormatSettings & settings) const +{ + dispatch(&ISerialization::serializeText, settings.interval.output_format, column, row, ostr, settings); +} + +void SerializationInterval::serializeTextCSV(const IColumn & column, size_t row, WriteBuffer & ostr, const FormatSettings & settings) const +{ + dispatch(&ISerialization::serializeTextCSV, settings.interval.output_format, column, row, ostr, settings); +} + +void SerializationInterval::serializeTextEscaped( + const IColumn & column, size_t row, WriteBuffer & ostr, const FormatSettings & settings) const +{ + dispatch(&ISerialization::serializeTextEscaped, settings.interval.output_format, column, row, ostr, settings); +} + +void SerializationInterval::serializeTextJSON(const IColumn & column, size_t row, WriteBuffer & ostr, const FormatSettings & settings) const +{ + dispatch(&ISerialization::serializeTextJSON, settings.interval.output_format, column, row, ostr, settings); +} + +void SerializationInterval::serializeTextQuoted( + const IColumn & column, size_t row, WriteBuffer & ostr, const FormatSettings & settings) const 
+{ + dispatch(&ISerialization::serializeTextQuoted, settings.interval.output_format, column, row, ostr, settings); +} + +void SerializationInterval::serializeTextRaw(const IColumn & column, size_t row, WriteBuffer & ostr, const FormatSettings & settings) const +{ + dispatch(&ISerialization::serializeTextRaw, settings.interval.output_format, column, row, ostr, settings); +} +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationInterval.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationInterval.h new file mode 100644 index 00000000000..a4e6c204e4f --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationInterval.h @@ -0,0 +1,90 @@ +#pragma once + +#include "ISerialization.h" +#include "SerializationCustomSimpleText.h" + +#include <DataTypes/DataTypeInterval.h> +#include <Formats/FormatSettings.h> +#include <Common/IntervalKind.h> + +namespace DB +{ +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +class SerializationKustoInterval : public SerializationCustomSimpleText +{ +public: + explicit SerializationKustoInterval(IntervalKind kind_) : SerializationCustomSimpleText(nullptr), kind(kind_) { } + + void serializeText(const IColumn & column, size_t row, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; + +private: + IntervalKind kind; +}; + +class SerializationInterval : public ISerialization +{ +public: + explicit SerializationInterval(IntervalKind kind_); + + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; + void 
deserializeBinaryBulkStatePrefix(DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const override; + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeBinary(const IColumn & column, size_t row, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + void serializeBinaryBulkStatePrefix( + const IColumn & column, SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const override; + void serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const override; + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + void serializeText(const IColumn & column, size_t row, WriteBuffer & ostr, const FormatSettings & settings) const 
override; + void serializeTextCSV(const IColumn & column, size_t row, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextEscaped(const IColumn & column, size_t row, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextJSON(const IColumn & column, size_t row, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextQuoted(const IColumn & column, size_t row, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextRaw(const IColumn & column, size_t row, WriteBuffer & ostr, const FormatSettings & settings) const override; + +private: + template <typename... Args, std::invocable<const ISerialization *, Args...> Method> + void dispatch(const Method method, const FormatSettings::IntervalOutputFormat format, Args &&... args) const + { + const ISerialization * serialization = nullptr; + if (format == FormatSettings::IntervalOutputFormat::Kusto) + serialization = &serialization_kusto; + else if (format == FormatSettings::IntervalOutputFormat::Numeric) + serialization = &serialization_numeric; + + if (!serialization) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Option {} is not implemented", magic_enum::enum_name(format)); + + (serialization->*method)(std::forward<Args>(args)...); + } + + IntervalKind interval_kind; + SerializationKustoInterval serialization_kusto{interval_kind}; + SerializationNumber<typename DataTypeInterval::FieldType> serialization_numeric; +}; +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationLowCardinality.cpp new file mode 100644 index 00000000000..3e1cbdb00f5 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -0,0 +1,781 @@ +#include <DataTypes/Serializations/SerializationLowCardinality.h> +#include <DataTypes/DataTypeLowCardinality.h> +#include 
<DataTypes/DataTypeNullable.h> +#include <DataTypes/DataTypesNumber.h> + +#include <Columns/ColumnLowCardinality.h> +#include <Columns/ColumnUnique.h> +#include <Columns/ColumnFixedString.h> +#include <Columns/ColumnsCommon.h> +#include <Common/HashTable/HashMap.h> +#include <Common/typeid_cast.h> +#include <Common/assert_cast.h> +#include <Core/Field.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int INCORRECT_DATA; +} + +namespace +{ + const ColumnLowCardinality & getColumnLowCardinality(const IColumn & column) + { + return typeid_cast<const ColumnLowCardinality &>(column); + } + + ColumnLowCardinality & getColumnLowCardinality(IColumn & column) + { + return typeid_cast<ColumnLowCardinality &>(column); + } +} + +SerializationLowCardinality::SerializationLowCardinality(const DataTypePtr & dictionary_type_) + : dictionary_type(dictionary_type_) + , dict_inner_serialization(removeNullable(dictionary_type_)->getDefaultSerialization()) +{ +} + +void SerializationLowCardinality::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + const auto * column_lc = data.column ? &getColumnLowCardinality(*data.column) : nullptr; + + settings.path.push_back(Substream::DictionaryKeys); + auto dict_data = SubstreamData(dict_inner_serialization) + .withType(data.type ? dictionary_type : nullptr) + .withColumn(column_lc ? column_lc->getDictionary().getNestedColumn() : nullptr) + .withSerializationInfo(data.serialization_info); + + settings.path.back().data = dict_data; + dict_inner_serialization->enumerateStreams(settings, callback, dict_data); + + settings.path.back() = Substream::DictionaryIndexes; + settings.path.back().data = data; + + callback(settings.path); + settings.path.pop_back(); +} + +struct KeysSerializationVersion +{ + enum Value + { + /// Version is written at the start of <name.dict.bin>. 
+ /// Dictionary is written as number N and N keys after them. + /// Dictionary can be shared for continuous range of granules, so some marks may point to the same position. + /// Shared dictionary is stored in state and is read once. + SharedDictionariesWithAdditionalKeys = 1, + }; + + Value value; + + static void checkVersion(UInt64 version) + { + if (version != SharedDictionariesWithAdditionalKeys) + throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid version for SerializationLowCardinality key column."); + } + + explicit KeysSerializationVersion(UInt64 version) : value(static_cast<Value>(version)) { checkVersion(version); } +}; + +/// Version is stored at the start of each granule. It's used to store indexes type and flags. +struct IndexesSerializationType +{ + using SerializationType = UInt64; + /// Need to read dictionary if it wasn't. + static constexpr SerializationType NeedGlobalDictionaryBit = 1u << 8u; + /// Need to read additional keys. Additional keys are stored before indexes as value N and N keys after them. + static constexpr SerializationType HasAdditionalKeysBit = 1u << 9u; + /// Need to update dictionary. It means that previous granule has different dictionary. 
+ static constexpr SerializationType NeedUpdateDictionary = 1u << 10u; + + enum Type + { + TUInt8 = 0, + TUInt16, + TUInt32, + TUInt64, + }; + + Type type; + bool has_additional_keys; + bool need_global_dictionary; + bool need_update_dictionary; + + static constexpr SerializationType resetFlags(SerializationType type) + { + return type & (~(HasAdditionalKeysBit | NeedGlobalDictionaryBit | NeedUpdateDictionary)); + } + + static void checkType(SerializationType type) + { + UInt64 value = resetFlags(type); + if (value <= TUInt64) + return; + + throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid type for SerializationLowCardinality index column."); + } + + void serialize(WriteBuffer & buffer) const + { + SerializationType val = type; + if (has_additional_keys) + val |= HasAdditionalKeysBit; + if (need_global_dictionary) + val |= NeedGlobalDictionaryBit; + if (need_update_dictionary) + val |= NeedUpdateDictionary; + writeBinaryLittleEndian(val, buffer); + } + + void deserialize(ReadBuffer & buffer, const ISerialization::DeserializeBinaryBulkSettings & settings) + { + SerializationType val; + readBinaryLittleEndian(val, buffer); + + checkType(val); + has_additional_keys = (val & HasAdditionalKeysBit) != 0; + need_global_dictionary = (val & NeedGlobalDictionaryBit) != 0; + need_update_dictionary = (val & NeedUpdateDictionary) != 0; + type = static_cast<Type>(resetFlags(val)); + + if (settings.native_format) + { + if (need_global_dictionary) + throw Exception(ErrorCodes::INCORRECT_DATA, + "LowCardinality indexes serialization type for Native format " + "cannot use global dictionary"); + } + } + + IndexesSerializationType(const IColumn & column, + bool has_additional_keys_, + bool need_global_dictionary_, + bool enumerate_dictionaries) + : has_additional_keys(has_additional_keys_) + , need_global_dictionary(need_global_dictionary_) + , need_update_dictionary(enumerate_dictionaries) + { + if (typeid_cast<const ColumnUInt8 *>(&column)) + type = TUInt8; + else if 
(typeid_cast<const ColumnUInt16 *>(&column)) + type = TUInt16; + else if (typeid_cast<const ColumnUInt32 *>(&column)) + type = TUInt32; + else if (typeid_cast<const ColumnUInt64 *>(&column)) + type = TUInt64; + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid Indexes column for IndexesSerializationType. " + "Expected ColumnUInt*, got {}", column.getName()); + } + + DataTypePtr getDataType() const + { + if (type == TUInt8) + return std::make_shared<DataTypeUInt8>(); + if (type == TUInt16) + return std::make_shared<DataTypeUInt16>(); + if (type == TUInt32) + return std::make_shared<DataTypeUInt32>(); + if (type == TUInt64) + return std::make_shared<DataTypeUInt64>(); + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't create DataType from IndexesSerializationType."); + } + + IndexesSerializationType() = default; +}; + +struct SerializeStateLowCardinality : public ISerialization::SerializeBinaryBulkState +{ + KeysSerializationVersion key_version; + MutableColumnUniquePtr shared_dictionary; + + explicit SerializeStateLowCardinality(UInt64 key_version_) : key_version(key_version_) {} +}; + +struct DeserializeStateLowCardinality : public ISerialization::DeserializeBinaryBulkState +{ + KeysSerializationVersion key_version; + ColumnUniquePtr global_dictionary; + + IndexesSerializationType index_type; + ColumnPtr additional_keys; + ColumnPtr null_map; + UInt64 num_pending_rows = 0; + + /// If dictionary should be updated. + /// Can happen is some granules was skipped while reading from MergeTree. + /// We should store this flag in State because + /// in case of long block of empty arrays we may not need read dictionary at first reading. 
+ bool need_update_dictionary = false; + + explicit DeserializeStateLowCardinality(UInt64 key_version_) : key_version(key_version_) {} +}; + +void SerializationLowCardinality::serializeBinaryBulkStatePrefix( + const IColumn & /*column*/, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::DictionaryKeys); + auto * stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream in SerializationLowCardinality::serializeBinaryBulkStatePrefix"); + + /// Write version and create SerializeBinaryBulkState. + UInt64 key_version = KeysSerializationVersion::SharedDictionariesWithAdditionalKeys; + + writeBinaryLittleEndian(key_version, *stream); + + state = std::make_shared<SerializeStateLowCardinality>(key_version); +} + +void SerializationLowCardinality::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + auto * low_cardinality_state = checkAndGetState<SerializeStateLowCardinality>(state); + KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); + + if (low_cardinality_state->shared_dictionary && settings.low_cardinality_max_dictionary_size) + { + auto nested_column = low_cardinality_state->shared_dictionary->getNestedNotNullableColumn(); + + settings.path.push_back(Substream::DictionaryKeys); + auto * stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream in SerializationLowCardinality::serializeBinaryBulkStateSuffix"); + + UInt64 num_keys = nested_column->size(); + writeBinaryLittleEndian(num_keys, *stream); + dict_inner_serialization->serializeBinaryBulk(*nested_column, *stream, 0, num_keys); + low_cardinality_state->shared_dictionary = nullptr; + } +} + +void SerializationLowCardinality::deserializeBinaryBulkStatePrefix( + 
DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::DictionaryKeys); + auto * stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!stream) + return; + + UInt64 keys_version; + readBinaryLittleEndian(keys_version, *stream); + + state = std::make_shared<DeserializeStateLowCardinality>(keys_version); +} + +namespace +{ + template <typename T> + PaddedPODArray<T> * getIndexesData(IColumn & indexes) + { + auto * column = typeid_cast<ColumnVector<T> *>(&indexes); + if (column) + return &column->getData(); + + return nullptr; + } + + struct IndexMapsWithAdditionalKeys + { + MutableColumnPtr dictionary_map; + MutableColumnPtr additional_keys_map; + }; + + template <typename T> + IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeys(PaddedPODArray<T> & index, size_t dict_size) + { + T max_less_dict_size = 0; + T max_value = 0; + + auto size = index.size(); + if (size == 0) + return {ColumnVector<T>::create(), ColumnVector<T>::create()}; + + for (size_t i = 0; i < size; ++i) + { + auto val = index[i]; + if (val < dict_size) + max_less_dict_size = std::max(max_less_dict_size, val); + + max_value = std::max(max_value, val); + } + + auto map_size = UInt64(max_less_dict_size) + 1; + auto overflow_map_size = max_value >= dict_size ? 
(UInt64(max_value - dict_size) + 1) : 0; + PaddedPODArray<T> map(map_size, 0); + PaddedPODArray<T> overflow_map(overflow_map_size, 0); + + T zero_pos_value = 0; + T zero_pos_overflowed_value = 0; + UInt64 cur_pos = 0; + UInt64 cur_overflowed_pos = 0; + + for (size_t i = 0; i < size; ++i) + { + T val = index[i]; + if (val < dict_size) + { + if (cur_pos == 0) + { + zero_pos_value = val; + ++cur_pos; + } + else if (map[val] == 0 && val != zero_pos_value) + { + map[val] = static_cast<T>(cur_pos); + ++cur_pos; + } + } + else + { + T shifted_val = static_cast<T>(val - dict_size); + if (cur_overflowed_pos == 0) + { + zero_pos_overflowed_value = shifted_val; + ++cur_overflowed_pos; + } + else if (overflow_map[shifted_val] == 0 && shifted_val != zero_pos_overflowed_value) + { + overflow_map[shifted_val] = static_cast<T>(cur_overflowed_pos); + ++cur_overflowed_pos; + } + } + } + + auto dictionary_map = ColumnVector<T>::create(cur_pos); + auto additional_keys_map = ColumnVector<T>::create(cur_overflowed_pos); + auto & dict_data = dictionary_map->getData(); + auto & add_keys_data = additional_keys_map->getData(); + + for (size_t i = 0; i < map_size; ++i) + if (map[i]) + dict_data[map[i]] = static_cast<T>(i); + + for (size_t i = 0; i < overflow_map_size; ++i) + if (overflow_map[i]) + add_keys_data[overflow_map[i]] = static_cast<T>(i); + + if (cur_pos) + dict_data[0] = zero_pos_value; + if (cur_overflowed_pos) + add_keys_data[0] = zero_pos_overflowed_value; + + for (size_t i = 0; i < size; ++i) + { + T & val = index[i]; + if (val < dict_size) + val = map[val]; + else + val = overflow_map[val - dict_size] + static_cast<T>(cur_pos); + } + + return {std::move(dictionary_map), std::move(additional_keys_map)}; + } + + /// Update column and return map with old indexes. 
+ /// Let N is the number of distinct values which are less than max_size; + /// old_column - column before function call; + /// new_column - column after function call: + /// * if old_column[i] < max_size, than + /// dictionary_map[new_column[i]] = old_column[i] + /// * else + /// additional_keys_map[new_column[i]] = old_column[i] - dict_size + N + IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeys(IColumn & column, size_t dict_size) + { + if (auto * data_uint8 = getIndexesData<UInt8>(column)) + return mapIndexWithAdditionalKeys(*data_uint8, dict_size); + else if (auto * data_uint16 = getIndexesData<UInt16>(column)) + return mapIndexWithAdditionalKeys(*data_uint16, dict_size); + else if (auto * data_uint32 = getIndexesData<UInt32>(column)) + return mapIndexWithAdditionalKeys(*data_uint32, dict_size); + else if (auto * data_uint64 = getIndexesData<UInt64>(column)) + return mapIndexWithAdditionalKeys(*data_uint64, dict_size); + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Indexes column for mapIndexWithAdditionalKeys must be UInt, got {}", + column.getName()); + } +} + +void SerializationLowCardinality::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::DictionaryKeys); + auto * keys_stream = settings.getter(settings.path); + settings.path.back() = Substream::DictionaryIndexes; + auto * indexes_stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!keys_stream && !indexes_stream) + return; + + if (!keys_stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for SerializationLowCardinality keys."); + + if (!indexes_stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for SerializationLowCardinality indexes."); + + const ColumnLowCardinality & low_cardinality_column = typeid_cast<const ColumnLowCardinality &>(column); + + auto * 
low_cardinality_state = checkAndGetState<SerializeStateLowCardinality>(state); + auto & global_dictionary = low_cardinality_state->shared_dictionary; + KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); + + bool need_update_dictionary = global_dictionary == nullptr; + if (need_update_dictionary) + global_dictionary = DataTypeLowCardinality::createColumnUnique(*dictionary_type); + + size_t max_limit = column.size() - offset; + limit = limit ? std::min(limit, max_limit) : max_limit; + + /// Do not write anything for empty column. (May happen while writing empty arrays.) + if (limit == 0) + return; + + auto sub_column = low_cardinality_column.cutAndCompact(offset, limit); + ColumnPtr positions = sub_column->getIndexesPtr(); + ColumnPtr keys = sub_column->getDictionary().getNestedColumn(); + + if (settings.low_cardinality_max_dictionary_size) + { + /// Insert used_keys into global dictionary and update sub_index. + auto indexes_with_overflow = global_dictionary->uniqueInsertRangeWithOverflow(*keys, 0, keys->size(), + settings.low_cardinality_max_dictionary_size); + + if (global_dictionary->size() > settings.low_cardinality_max_dictionary_size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got dictionary with size {} but max dictionary size is {}", + global_dictionary->size(), settings.low_cardinality_max_dictionary_size); + + positions = indexes_with_overflow.indexes->index(*positions, 0); + keys = std::move(indexes_with_overflow.overflowed_keys); + + if (global_dictionary->size() < settings.low_cardinality_max_dictionary_size && !keys->empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Has additional keys, but dict size is {} which is less " + "then max dictionary size ({})", global_dictionary->size(), + settings.low_cardinality_max_dictionary_size); + } + + if (const auto * nullable_keys = checkAndGetColumn<ColumnNullable>(*keys)) + keys = nullable_keys->getNestedColumnPtr(); + + bool need_additional_keys = !keys->empty(); + bool 
need_dictionary = settings.low_cardinality_max_dictionary_size != 0; + bool need_write_dictionary = !settings.low_cardinality_use_single_dictionary_for_part + && global_dictionary->size() >= settings.low_cardinality_max_dictionary_size; + + IndexesSerializationType index_version(*positions, need_additional_keys, need_dictionary, need_update_dictionary); + index_version.serialize(*indexes_stream); + + if (need_write_dictionary) + { + const auto & nested_column = global_dictionary->getNestedNotNullableColumn(); + UInt64 num_keys = nested_column->size(); + writeBinaryLittleEndian(num_keys, *keys_stream); + dict_inner_serialization->serializeBinaryBulk(*nested_column, *keys_stream, 0, num_keys); + low_cardinality_state->shared_dictionary = nullptr; + } + + if (need_additional_keys) + { + UInt64 num_keys = keys->size(); + writeBinaryLittleEndian(num_keys, *indexes_stream); + dict_inner_serialization->serializeBinaryBulk(*keys, *indexes_stream, 0, num_keys); + } + + UInt64 num_rows = positions->size(); + writeBinaryLittleEndian(num_rows, *indexes_stream); + auto index_serialization = index_version.getDataType()->getDefaultSerialization(); + index_serialization->serializeBinaryBulk(*positions, *indexes_stream, 0, num_rows); +} + +void SerializationLowCardinality::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * /* cache */) const +{ + auto mutable_column = column->assumeMutable(); + ColumnLowCardinality & low_cardinality_column = typeid_cast<ColumnLowCardinality &>(*mutable_column); + + settings.path.push_back(Substream::DictionaryKeys); + auto * keys_stream = settings.getter(settings.path); + settings.path.back() = Substream::DictionaryIndexes; + auto * indexes_stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!keys_stream && !indexes_stream) + return; + + if (!keys_stream) + throw 
Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for SerializationLowCardinality keys."); + + if (!indexes_stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for SerializationLowCardinality indexes."); + + auto * low_cardinality_state = checkAndGetState<DeserializeStateLowCardinality>(state); + KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value); + + auto read_dictionary = [this, low_cardinality_state, keys_stream]() + { + UInt64 num_keys; + readBinaryLittleEndian(num_keys, *keys_stream); + + auto keys_type = removeNullable(dictionary_type); + auto global_dict_keys = keys_type->createColumn(); + dict_inner_serialization->deserializeBinaryBulk(*global_dict_keys, *keys_stream, num_keys, 0); + + auto column_unique = DataTypeLowCardinality::createColumnUnique(*dictionary_type, std::move(global_dict_keys)); + low_cardinality_state->global_dictionary = std::move(column_unique); + }; + + auto read_additional_keys = [this, low_cardinality_state, indexes_stream]() + { + UInt64 num_keys; + readBinaryLittleEndian(num_keys, *indexes_stream); + + auto keys_type = removeNullable(dictionary_type); + auto additional_keys = keys_type->createColumn(); + dict_inner_serialization->deserializeBinaryBulk(*additional_keys, *indexes_stream, num_keys, 0); + low_cardinality_state->additional_keys = std::move(additional_keys); + + if (!low_cardinality_state->index_type.need_global_dictionary && dictionary_type->isNullable()) + { + auto null_map = ColumnUInt8::create(num_keys, 0); + if (num_keys) + null_map->getElement(0) = 1; + + low_cardinality_state->null_map = std::move(null_map); + } + }; + + auto read_indexes = [this, low_cardinality_state, indexes_stream, &low_cardinality_column](UInt64 num_rows) + { + auto indexes_type = low_cardinality_state->index_type.getDataType(); + MutableColumnPtr indexes_column = indexes_type->createColumn(); + indexes_type->getDefaultSerialization()->deserializeBinaryBulk(*indexes_column, *indexes_stream, 
num_rows, 0); + + auto & global_dictionary = low_cardinality_state->global_dictionary; + const auto & additional_keys = low_cardinality_state->additional_keys; + + bool has_additional_keys = low_cardinality_state->index_type.has_additional_keys; + bool column_is_empty = low_cardinality_column.empty(); + + if (!low_cardinality_state->index_type.need_global_dictionary) + { + if (additional_keys == nullptr) + throw Exception(ErrorCodes::INCORRECT_DATA, "No additional keys found."); + + ColumnPtr keys_column = additional_keys; + if (low_cardinality_state->null_map) + keys_column = ColumnNullable::create(additional_keys, low_cardinality_state->null_map); + low_cardinality_column.insertRangeFromDictionaryEncodedColumn(*keys_column, *indexes_column); + } + else if (!has_additional_keys) + { + if (column_is_empty) + low_cardinality_column.setSharedDictionary(global_dictionary); + + auto local_column = ColumnLowCardinality::create(global_dictionary, std::move(indexes_column)); + low_cardinality_column.insertRangeFrom(*local_column, 0, num_rows); + } + else + { + auto maps = mapIndexWithAdditionalKeys(*indexes_column, global_dictionary->size()); + + auto used_keys = IColumn::mutate(global_dictionary->getNestedColumn()->index(*maps.dictionary_map, 0)); + + if (!maps.additional_keys_map->empty()) + { + if (additional_keys == nullptr) + throw Exception(ErrorCodes::INCORRECT_DATA, "No additional keys found."); + + auto used_add_keys = additional_keys->index(*maps.additional_keys_map, 0); + + if (dictionary_type->isNullable()) + { + ColumnPtr null_map = ColumnUInt8::create(used_add_keys->size(), 0); + used_add_keys = ColumnNullable::create(used_add_keys, null_map); + } + + used_keys->insertRangeFrom(*used_add_keys, 0, used_add_keys->size()); + } + + low_cardinality_column.insertRangeFromDictionaryEncodedColumn(*used_keys, *indexes_column); + } + }; + + if (!settings.continuous_reading) + { + low_cardinality_state->num_pending_rows = 0; + + /// Remember in state that some granules 
were skipped and we need to update dictionary. + low_cardinality_state->need_update_dictionary = true; + } + + while (limit) + { + if (low_cardinality_state->num_pending_rows == 0) + { + if (indexes_stream->eof()) + break; + + auto & index_type = low_cardinality_state->index_type; + auto & global_dictionary = low_cardinality_state->global_dictionary; + + index_type.deserialize(*indexes_stream, settings); + + bool need_update_dictionary = + !global_dictionary || index_type.need_update_dictionary || low_cardinality_state->need_update_dictionary; + if (index_type.need_global_dictionary && need_update_dictionary) + { + read_dictionary(); + low_cardinality_state->need_update_dictionary = false; + } + + if (low_cardinality_state->index_type.has_additional_keys) + read_additional_keys(); + else + low_cardinality_state->additional_keys = nullptr; + + readBinaryLittleEndian(low_cardinality_state->num_pending_rows, *indexes_stream); + } + + size_t num_rows_to_read = std::min<UInt64>(limit, low_cardinality_state->num_pending_rows); + read_indexes(num_rows_to_read); + limit -= num_rows_to_read; + low_cardinality_state->num_pending_rows -= num_rows_to_read; + } + + column = std::move(mutable_column); +} + +void SerializationLowCardinality::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const +{ + dictionary_type->getDefaultSerialization()->serializeBinary(field, ostr, settings); +} +void SerializationLowCardinality::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const +{ + dictionary_type->getDefaultSerialization()->deserializeBinary(field, istr, settings); +} + +void SerializationLowCardinality::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeBinary, ostr, settings); +} +void SerializationLowCardinality::deserializeBinary(IColumn & column, ReadBuffer & istr, const 
FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeBinary, istr, settings); +} + +void SerializationLowCardinality::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextEscaped, ostr, settings); +} + +void SerializationLowCardinality::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeTextEscaped, istr, settings); +} + +void SerializationLowCardinality::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextQuoted, ostr, settings); +} + +void SerializationLowCardinality::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeTextQuoted, istr, settings); +} + +void SerializationLowCardinality::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeWholeText, istr, settings); +} + +void SerializationLowCardinality::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextCSV, ostr, settings); +} + +void SerializationLowCardinality::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeTextCSV, istr, settings); +} + +void SerializationLowCardinality::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeText, ostr, settings); +} + +void 
SerializationLowCardinality::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextJSON, ostr, settings); +} + +void SerializationLowCardinality::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeTextJSON, istr, settings); +} + +void SerializationLowCardinality::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextXML, ostr, settings); +} + +void SerializationLowCardinality::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeTextRaw, istr, settings); +} + +void SerializationLowCardinality::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextRaw, ostr, settings); +} + +template <typename... Params, typename... Args> +void SerializationLowCardinality::serializeImpl( + const IColumn & column, size_t row_num, SerializationLowCardinality::SerializeFunctionPtr<Params...> func, Args &&... args) const +{ + const auto & low_cardinality_column = getColumnLowCardinality(column); + size_t unique_row_number = low_cardinality_column.getIndexes().getUInt(row_num); + auto serialization = dictionary_type->getDefaultSerialization(); + (serialization.get()->*func)(*low_cardinality_column.getDictionary().getNestedColumn(), unique_row_number, std::forward<Args>(args)...); +} + +template <typename... Params, typename... Args> +void SerializationLowCardinality::deserializeImpl( + IColumn & column, SerializationLowCardinality::DeserializeFunctionPtr<Params...> func, Args &&... 
args) const +{ + auto & low_cardinality_column= getColumnLowCardinality(column); + auto temp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty(); + + auto serialization = dictionary_type->getDefaultSerialization(); + (serialization.get()->*func)(*temp_column, std::forward<Args>(args)...); + + low_cardinality_column.insertFromFullColumn(*temp_column, 0); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationLowCardinality.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationLowCardinality.h new file mode 100644 index 00000000000..5f56bcf8108 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationLowCardinality.h @@ -0,0 +1,84 @@ +#pragma once + +#include <DataTypes/Serializations/ISerialization.h> + +namespace DB +{ + +class IDataType; +using DataTypePtr = std::shared_ptr<const IDataType>; + +class SerializationLowCardinality : public ISerialization +{ +private: + DataTypePtr dictionary_type; + SerializationPtr dict_inner_serialization; + +public: + explicit SerializationLowCardinality(const DataTypePtr & dictionary_type); + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & 
column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void 
deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + +private: + template <typename ... Params> + using SerializeFunctionPtr = void (ISerialization::*)(const IColumn &, size_t, Params ...) const; + + template <typename... Params, typename... Args> + void serializeImpl(const IColumn & column, size_t row_num, SerializeFunctionPtr<Params...> func, Args &&... args) const; + + template <typename ... Params> + using DeserializeFunctionPtr = void (ISerialization::*)(IColumn &, Params ...) const; + + template <typename ... Params, typename... Args> + void deserializeImpl(IColumn & column, DeserializeFunctionPtr<Params...> func, Args &&... args) const; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationMap.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationMap.cpp new file mode 100644 index 00000000000..af1d96c4ca7 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationMap.cpp @@ -0,0 +1,365 @@ +#include <DataTypes/Serializations/SerializationMap.h> +#include <DataTypes/Serializations/SerializationNullable.h> +#include <DataTypes/DataTypeMap.h> + +#include <Common/StringUtils/StringUtils.h> +#include <Columns/ColumnMap.h> +#include <Core/Field.h> +#include <Formats/FormatSettings.h> +#include <Common/assert_cast.h> +#include <Common/quoteString.h> +#include <IO/WriteHelpers.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteBufferFromString.h> +#include <IO/ReadBufferFromString.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_READ_MAP_FROM_TEXT; + extern const int TOO_LARGE_ARRAY_SIZE; +} + +SerializationMap::SerializationMap(const SerializationPtr & key_, const SerializationPtr & value_, const SerializationPtr & nested_) + : key(key_), value(value_), nested(nested_) +{ +} + +static const 
IColumn & extractNestedColumn(const IColumn & column) +{ + return assert_cast<const ColumnMap &>(column).getNestedColumn(); +} + +static IColumn & extractNestedColumn(IColumn & column) +{ + return assert_cast<ColumnMap &>(column).getNestedColumn(); +} + +void SerializationMap::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & map = field.get<const Map &>(); + writeVarUInt(map.size(), ostr); + for (const auto & elem : map) + { + const auto & tuple = elem.safeGet<const Tuple>(); + assert(tuple.size() == 2); + key->serializeBinary(tuple[0], ostr, settings); + value->serializeBinary(tuple[1], ostr, settings); + } +} + +void SerializationMap::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const +{ + size_t size; + readVarUInt(size, istr); + if (settings.max_binary_array_size && size > settings.max_binary_array_size) + throw Exception( + ErrorCodes::TOO_LARGE_ARRAY_SIZE, + "Too large map size: {}. The maximum is: {}. 
To increase the maximum, use setting " + "format_binary_max_array_size", + size, + settings.max_binary_array_size); + field = Map(); + Map & map = field.get<Map &>(); + map.reserve(size); + for (size_t i = 0; i < size; ++i) + { + Tuple tuple(2); + key->deserializeBinary(tuple[0], istr, settings); + value->deserializeBinary(tuple[1], istr, settings); + map.push_back(std::move(tuple)); + } +} + +void SerializationMap::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested->serializeBinary(extractNestedColumn(column), row_num, ostr, settings); +} + +void SerializationMap::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + nested->deserializeBinary(extractNestedColumn(column), istr, settings); +} + + +template <typename KeyWriter, typename ValueWriter> +void SerializationMap::serializeTextImpl( + const IColumn & column, + size_t row_num, + WriteBuffer & ostr, + KeyWriter && key_writer, + ValueWriter && value_writer) const +{ + const auto & column_map = assert_cast<const ColumnMap &>(column); + + const auto & nested_array = column_map.getNestedColumn(); + const auto & nested_tuple = column_map.getNestedData(); + const auto & offsets = nested_array.getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + + writeChar('{', ostr); + for (size_t i = offset; i < next_offset; ++i) + { + if (i != offset) + writeChar(',', ostr); + + key_writer(ostr, key, nested_tuple.getColumn(0), i); + writeChar(':', ostr); + value_writer(ostr, value, nested_tuple.getColumn(1), i); + } + writeChar('}', ostr); +} + +template <typename Reader> +void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const +{ + auto & column_map = assert_cast<ColumnMap &>(column); + + auto & nested_array = column_map.getNestedColumn(); + auto & nested_tuple = column_map.getNestedData(); + auto & offsets = 
nested_array.getOffsets(); + + auto & key_column = nested_tuple.getColumn(0); + auto & value_column = nested_tuple.getColumn(1); + + size_t size = 0; + assertChar('{', istr); + + try + { + bool first = true; + while (!istr.eof() && *istr.position() != '}') + { + if (!first) + { + if (*istr.position() == ',') + ++istr.position(); + else + throw Exception(ErrorCodes::CANNOT_READ_MAP_FROM_TEXT, "Cannot read Map from text"); + } + + first = false; + + skipWhitespaceIfAny(istr); + + if (*istr.position() == '}') + break; + + reader(istr, key, key_column); + ++size; + + skipWhitespaceIfAny(istr); + assertChar(':', istr); + skipWhitespaceIfAny(istr); + + reader(istr, value, value_column); + + skipWhitespaceIfAny(istr); + } + + assertChar('}', istr); + } + catch (...) + { + if (size) + { + nested_tuple.getColumnPtr(0) = key_column.cut(0, offsets.back()); + nested_tuple.getColumnPtr(1) = value_column.cut(0, offsets.back()); + } + throw; + } + + offsets.push_back(offsets.back() + size); +} + +void SerializationMap::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + auto writer = [&settings](WriteBuffer & buf, const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos) + { + subcolumn_serialization->serializeTextQuoted(subcolumn, pos, buf, settings); + }; + + serializeTextImpl(column, row_num, ostr, writer, writer); +} + +void SerializationMap::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const +{ + deserializeTextImpl(column, istr, + [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn) + { + subcolumn_serialization->deserializeTextQuoted(subcolumn, buf, settings); + }); + + if (whole && !istr.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "Map"); +} + +void SerializationMap::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const 
FormatSettings & settings) const +{ + serializeTextImpl(column, row_num, ostr, + [&settings](WriteBuffer & buf, const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos) + { + /// We need to double-quote all keys (including integers) to produce valid JSON. + WriteBufferFromOwnString str_buf; + subcolumn_serialization->serializeText(subcolumn, pos, str_buf, settings); + writeJSONString(str_buf.str(), buf, settings); + }, + [&settings](WriteBuffer & buf, const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos) + { + subcolumn_serialization->serializeTextJSON(subcolumn, pos, buf, settings); + }); +} + +void SerializationMap::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const +{ + const auto & column_map = assert_cast<const ColumnMap &>(column); + + const auto & nested_array = column_map.getNestedColumn(); + const auto & nested_tuple = column_map.getNestedData(); + const auto & offsets = nested_array.getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + + if (offset == next_offset) + { + writeCString("{}", ostr); + return; + } + + writeCString("{\n", ostr); + for (size_t i = offset; i < next_offset; ++i) + { + if (i != offset) + writeCString(",\n", ostr); + + WriteBufferFromOwnString str_buf; + key->serializeText(nested_tuple.getColumn(0), i, str_buf, settings); + + writeChar(' ', (indent + 1) * 4, ostr); + writeJSONString(str_buf.str(), ostr, settings); + writeCString(": ", ostr); + value->serializeTextJSONPretty(nested_tuple.getColumn(1), i, ostr, settings, indent + 1); + } + writeChar('\n', ostr); + writeChar(' ', indent * 4, ostr); + writeChar('}', ostr); +} + + +void SerializationMap::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextImpl(column, istr, + [&settings](ReadBuffer & buf, const SerializationPtr & 
subcolumn_serialization, IColumn & subcolumn) + { + if (settings.null_as_default) + SerializationNullable::deserializeTextJSONImpl(subcolumn, buf, settings, subcolumn_serialization); + else + subcolumn_serialization->deserializeTextJSON(subcolumn, buf, settings); + }); +} + +void SerializationMap::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_map = assert_cast<const ColumnMap &>(column); + const auto & offsets = column_map.getNestedColumn().getOffsets(); + + size_t offset = offsets[row_num - 1]; + size_t next_offset = offsets[row_num]; + + const auto & nested_data = column_map.getNestedData(); + + writeCString("<map>", ostr); + for (size_t i = offset; i < next_offset; ++i) + { + writeCString("<elem>", ostr); + writeCString("<key>", ostr); + key->serializeTextXML(nested_data.getColumn(0), i, ostr, settings); + writeCString("</key>", ostr); + + writeCString("<value>", ostr); + value->serializeTextXML(nested_data.getColumn(1), i, ostr, settings); + writeCString("</value>", ostr); + writeCString("</elem>", ostr); + } + writeCString("</map>", ostr); +} + +void SerializationMap::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + WriteBufferFromOwnString wb; + serializeText(column, row_num, wb, settings); + writeCSV(wb.str(), ostr); +} + +void SerializationMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String s; + readCSV(s, istr, settings.csv); + ReadBufferFromString rb(s); + deserializeText(column, rb, settings, true); +} + +void SerializationMap::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + auto next_data = SubstreamData(nested) + .withType(data.type ? assert_cast<const DataTypeMap &>(*data.type).getNestedType() : nullptr) + .withColumn(data.column ? 
assert_cast<const ColumnMap &>(*data.column).getNestedColumnPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + nested->enumerateStreams(settings, callback, next_data); +} + +void SerializationMap::serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + nested->serializeBinaryBulkStatePrefix(extractNestedColumn(column), settings, state); +} + +void SerializationMap::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + nested->serializeBinaryBulkStateSuffix(settings, state); +} + +void SerializationMap::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + nested->deserializeBinaryBulkStatePrefix(settings, state); +} + + +void SerializationMap::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + nested->serializeBinaryBulkWithMultipleStreams(extractNestedColumn(column), offset, limit, settings, state); +} + +void SerializationMap::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto & column_map = assert_cast<ColumnMap &>(*column->assumeMutable()); + nested->deserializeBinaryBulkWithMultipleStreams(column_map.getNestedColumnPtr(), limit, settings, state, cache); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationMap.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationMap.h new file mode 100644 index 00000000000..f32c656757d --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationMap.h @@ -0,0 +1,76 @@ +#pragma once + +#include 
<DataTypes/Serializations/SimpleTextSerialization.h> + + +namespace DB +{ + +class SerializationMap final : public SimpleTextSerialization +{ +private: + SerializationPtr key; + SerializationPtr value; + + /// 'nested' is an Array(Tuple(key_type, value_type)) + SerializationPtr nested; + +public: + SerializationMap(const SerializationPtr & key_type_, const SerializationPtr & value_type_, const SerializationPtr & nested_); + + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; 
+ + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + +private: + template <typename KeyWriter, typename ValueWriter> + void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, KeyWriter && key_writer, ValueWriter && value_writer) const; + + template <typename Reader> + void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const; +}; + +} + diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationNamed.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNamed.cpp new file mode 100644 index 00000000000..ca60948ce68 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNamed.cpp @@ -0,0 +1,78 @@ +#include <DataTypes/Serializations/SerializationNamed.h> + +namespace DB +{ + +void SerializationNamed::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + addToPath(settings.path); + settings.path.back().data = data; + settings.path.back().creator = std::make_shared<SubcolumnCreator>(name, escape_delimiter); + + nested_serialization->enumerateStreams(settings, callback, data); + 
settings.path.pop_back(); +} + +void SerializationNamed::serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + addToPath(settings.path); + nested_serialization->serializeBinaryBulkStatePrefix(column, settings, state); + settings.path.pop_back(); +} + +void SerializationNamed::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + addToPath(settings.path); + nested_serialization->serializeBinaryBulkStateSuffix(settings, state); + settings.path.pop_back(); +} + +void SerializationNamed::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + addToPath(settings.path); + nested_serialization->deserializeBinaryBulkStatePrefix(settings, state); + settings.path.pop_back(); +} + +void SerializationNamed::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + addToPath(settings.path); + nested_serialization->serializeBinaryBulkWithMultipleStreams(column, offset, limit, settings, state); + settings.path.pop_back(); +} + +void SerializationNamed::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + addToPath(settings.path); + nested_serialization->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, state, cache); + settings.path.pop_back(); +} + +void SerializationNamed::addToPath(SubstreamPath & path) const +{ + path.push_back(Substream::TupleElement); + path.back().tuple_element_name = name; + path.back().escape_tuple_delimiter = escape_delimiter; +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationNamed.h 
b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNamed.h new file mode 100644 index 00000000000..52bbb039442 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNamed.h @@ -0,0 +1,80 @@ +#pragma once + +#include <DataTypes/Serializations/SerializationWrapper.h> + +namespace DB +{ + +/// Serialization wrapper that acts like nested serialization, +/// but adds a passed name to the substream path like the +/// read column was the tuple element with this name. +/// It's used while reading subcolumns of complex types. +/// In particular while reading components of named tuples. +class SerializationNamed final : public SerializationWrapper +{ +private: + String name; + bool escape_delimiter; + +public: + SerializationNamed(const SerializationPtr & nested_, const String & name_, bool escape_delimiter_ = true) + : SerializationWrapper(nested_) + , name(name_), escape_delimiter(escape_delimiter_) + { + } + + const String & getElementName() const { return name; } + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + 
SubstreamsCache * cache) const override; + +private: + struct SubcolumnCreator : public ISubcolumnCreator + { + const String name; + const bool escape_delimiter; + + SubcolumnCreator(const String & name_, bool escape_delimiter_) + : name(name_), escape_delimiter(escape_delimiter_) {} + + DataTypePtr create(const DataTypePtr & prev) const override { return prev; } + ColumnPtr create(const ColumnPtr & prev) const override { return prev; } + SerializationPtr create(const SerializationPtr & prev) const override + { + return std::make_shared<SerializationNamed>(prev, name, escape_delimiter); + } + }; + + void addToPath(SubstreamPath & path) const; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationNothing.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNothing.cpp new file mode 100644 index 00000000000..6b11ea6d252 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNothing.cpp @@ -0,0 +1,25 @@ +#include <DataTypes/Serializations/SerializationNothing.h> +#include <Columns/ColumnNothing.h> +#include <IO/ReadBuffer.h> +#include <IO/WriteBuffer.h> + +namespace DB +{ + +void SerializationNothing::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + size_t size = column.size(); + + if (limit == 0 || offset + limit > size) + limit = size - offset; + + for (size_t i = 0; i < limit; ++i) + ostr.write('0'); +} + +void SerializationNothing::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const +{ + typeid_cast<ColumnNothing &>(column).addSize(istr.tryIgnore(limit)); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationNothing.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNothing.h new file mode 100644 index 00000000000..02974d1ca76 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNothing.h @@ -0,0 +1,34 @@ +#pragma 
once + +#include <DataTypes/Serializations/SimpleTextSerialization.h> +#include <Common/Exception.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +class SerializationNothing : public SimpleTextSerialization +{ +private: + [[noreturn]] static void throwNoSerialization() + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Serialization is not implemented for type Nothing"); + } +public: + void serializeBinary(const Field &, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); } + void deserializeBinary(Field &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); } + void serializeBinary(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); } + void deserializeBinary(IColumn &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); } + void serializeText(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); } + void deserializeText(IColumn &, ReadBuffer &, const FormatSettings &, bool) const override { throwNoSerialization(); } + + /// These methods read and write zero bytes just to allow to figure out size of column. 
+ void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationNullable.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNullable.cpp new file mode 100644 index 00000000000..774b86472be --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNullable.cpp @@ -0,0 +1,670 @@ +#include <DataTypes/Serializations/SerializationNullable.h> +#include <DataTypes/Serializations/SerializationNumber.h> +#include <DataTypes/Serializations/SerializationNamed.h> +#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/DataTypesNumber.h> + +#include <Columns/ColumnNullable.h> +#include <Core/Field.h> +#include <IO/ReadBuffer.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteBuffer.h> +#include <IO/WriteHelpers.h> +#include <IO/PeekableReadBuffer.h> +#include <Common/assert_cast.h> +#include <base/scope_guard.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_READ_ALL_DATA; +} + +DataTypePtr SerializationNullable::SubcolumnCreator::create(const DataTypePtr & prev) const +{ + return std::make_shared<DataTypeNullable>(prev); +} + +SerializationPtr SerializationNullable::SubcolumnCreator::create(const SerializationPtr & prev) const +{ + return std::make_shared<SerializationNullable>(prev); +} + +ColumnPtr SerializationNullable::SubcolumnCreator::create(const ColumnPtr & prev) const +{ + return ColumnNullable::create(prev, null_map); +} + +void SerializationNullable::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + const auto * type_nullable = data.type ? &assert_cast<const DataTypeNullable &>(*data.type) : nullptr; + const auto * column_nullable = data.column ? 
&assert_cast<const ColumnNullable &>(*data.column) : nullptr; + + auto null_map_serialization = std::make_shared<SerializationNamed>(std::make_shared<SerializationNumber<UInt8>>(), "null", false); + + settings.path.push_back(Substream::NullMap); + auto null_map_data = SubstreamData(null_map_serialization) + .withType(type_nullable ? std::make_shared<DataTypeUInt8>() : nullptr) + .withColumn(column_nullable ? column_nullable->getNullMapColumnPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + settings.path.back().data = null_map_data; + callback(settings.path); + + settings.path.back() = Substream::NullableElements; + settings.path.back().creator = std::make_shared<SubcolumnCreator>(null_map_data.column); + settings.path.back().data = data; + + auto next_data = SubstreamData(nested) + .withType(type_nullable ? type_nullable->getNestedType() : nullptr) + .withColumn(column_nullable ? column_nullable->getNestedColumnPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + nested->enumerateStreams(settings, callback, next_data); + settings.path.pop_back(); +} + +void SerializationNullable::serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::NullableElements); + const auto & column_nullable = assert_cast<const ColumnNullable &>(column); + nested->serializeBinaryBulkStatePrefix(column_nullable.getNestedColumn(), settings, state); + settings.path.pop_back(); +} + + +void SerializationNullable::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::NullableElements); + nested->serializeBinaryBulkStateSuffix(settings, state); + settings.path.pop_back(); +} + + +void SerializationNullable::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) 
const +{ + settings.path.push_back(Substream::NullableElements); + nested->deserializeBinaryBulkStatePrefix(settings, state); + settings.path.pop_back(); +} + + +void SerializationNullable::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + const ColumnNullable & col = assert_cast<const ColumnNullable &>(column); + col.checkConsistency(); + + /// First serialize null map. + settings.path.push_back(Substream::NullMap); + if (auto * stream = settings.getter(settings.path)) + SerializationNumber<UInt8>().serializeBinaryBulk(col.getNullMapColumn(), *stream, offset, limit); + + /// Then serialize contents of arrays. + settings.path.back() = Substream::NullableElements; + nested->serializeBinaryBulkWithMultipleStreams(col.getNestedColumn(), offset, limit, settings, state); + settings.path.pop_back(); +} + + +void SerializationNullable::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto mutable_column = column->assumeMutable(); + ColumnNullable & col = assert_cast<ColumnNullable &>(*mutable_column); + + settings.path.push_back(Substream::NullMap); + if (auto cached_column = getFromSubstreamsCache(cache, settings.path)) + { + col.getNullMapColumnPtr() = cached_column; + } + else if (auto * stream = settings.getter(settings.path)) + { + SerializationNumber<UInt8>().deserializeBinaryBulk(col.getNullMapColumn(), *stream, limit, 0); + addToSubstreamsCache(cache, settings.path, col.getNullMapColumnPtr()); + } + + settings.path.back() = Substream::NullableElements; + nested->deserializeBinaryBulkWithMultipleStreams(col.getNestedColumnPtr(), limit, settings, state, cache); + settings.path.pop_back(); +} + + +void SerializationNullable::serializeBinary(const Field & field, WriteBuffer & 
ostr, const FormatSettings & settings) const +{ + if (field.isNull()) + { + writeBinary(true, ostr); + } + else + { + writeBinary(false, ostr); + nested->serializeBinary(field, ostr, settings); + } +} + +void SerializationNullable::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const +{ + bool is_null = false; + readBinary(is_null, istr); + if (!is_null) + { + nested->deserializeBinary(field, istr, settings); + } + else + { + field = Null(); + } +} + +void SerializationNullable::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast<const ColumnNullable &>(column); + + bool is_null = col.isNullAt(row_num); + writeBinary(is_null, ostr); + if (!is_null) + nested->serializeBinary(col.getNestedColumn(), row_num, ostr, settings); +} + +/// Deserialize value into ColumnNullable. +/// We need to insert both to nested column and to null byte map, or, in case of exception, to not insert at all. +template <typename ReturnType = void, typename CheckForNull, typename DeserializeNested, ReturnType * = nullptr> +requires std::same_as<ReturnType, void> +static ReturnType +safeDeserialize(IColumn & column, const ISerialization &, CheckForNull && check_for_null, DeserializeNested && deserialize_nested) +{ + ColumnNullable & col = assert_cast<ColumnNullable &>(column); + + if (check_for_null()) + { + col.insertDefault(); + } + else + { + deserialize_nested(col.getNestedColumn()); + + try + { + col.getNullMapData().push_back(0); + } + catch (...) + { + col.getNestedColumn().popBack(1); + throw; + } + } +} + +/// Deserialize value into non-nullable column. In case of NULL, insert default value and return false. 
+template <typename ReturnType = void, typename CheckForNull, typename DeserializeNested, ReturnType * = nullptr> +requires std::same_as<ReturnType, bool> +static ReturnType +safeDeserialize(IColumn & column, const ISerialization &, CheckForNull && check_for_null, DeserializeNested && deserialize_nested) +{ + bool insert_default = check_for_null(); + if (insert_default) + column.insertDefault(); + else + deserialize_nested(column); + return !insert_default; +} + + +void SerializationNullable::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + safeDeserialize(column, *nested, + [&istr] { bool is_null = false; readBinary(is_null, istr); return is_null; }, + [this, &istr, settings] (IColumn & nested_column) { nested->deserializeBinary(nested_column, istr, settings); }); +} + + +void SerializationNullable::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast<const ColumnNullable &>(column); + + if (col.isNullAt(row_num)) + writeString(settings.tsv.null_representation, ostr); + else + nested->serializeTextEscaped(col.getNestedColumn(), row_num, ostr, settings); +} + + +void SerializationNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextEscapedImpl<void>(column, istr, settings, nested); +} + +void SerializationNullable::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextRawImpl<void>(column, istr, settings, nested); +} + +void SerializationNullable::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast<const ColumnNullable &>(column); + + if (col.isNullAt(row_num)) + writeString(settings.tsv.null_representation, ostr); + else + 
nested->serializeTextRaw(col.getNestedColumn(), row_num, ostr, settings); +} + +template<typename ReturnType> +ReturnType SerializationNullable::deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested) +{ + return deserializeTextEscapedAndRawImpl<ReturnType, false>(column, istr, settings, nested); +} + +template<typename ReturnType> +ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, + const SerializationPtr & nested) +{ + return deserializeTextEscapedAndRawImpl<ReturnType, true>(column, istr, settings, nested); +} + +template<typename ReturnType, bool escaped> +ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, + const SerializationPtr & nested_serialization) +{ + const String & null_representation = settings.tsv.null_representation; + + /// Some data types can deserialize absence of data (e.g. empty string), so eof is ok. + if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0])) + { + /// This is not null, surely. + return safeDeserialize<ReturnType>(column, *nested_serialization, + [] { return false; }, + [&nested_serialization, &istr, &settings] (IColumn & nested_column) + { + if constexpr (escaped) + nested_serialization->deserializeTextEscaped(nested_column, istr, settings); + else + nested_serialization->deserializeTextRaw(nested_column, istr, settings); + }); + } + + /// Check if we have enough data in buffer to check if it's a null. 
+ if (istr.available() > null_representation.size()) + { + auto check_for_null = [&istr, &null_representation]() + { + auto * pos = istr.position(); + if (checkString(null_representation, istr) && (*istr.position() == '\t' || *istr.position() == '\n')) + return true; + istr.position() = pos; + return false; + }; + auto deserialize_nested = [&nested_serialization, &settings, &istr] (IColumn & nested_column) + { + if constexpr (escaped) + nested_serialization->deserializeTextEscaped(nested_column, istr, settings); + else + nested_serialization->deserializeTextRaw(nested_column, istr, settings); + }; + return safeDeserialize<ReturnType>(column, *nested_serialization, check_for_null, deserialize_nested); + } + + /// We don't have enough data in buffer to check if it's a null. + /// Use PeekableReadBuffer to make a checkpoint before checking null + /// representation and rollback if check was failed. + PeekableReadBuffer buf(istr, true); + auto check_for_null = [&buf, &null_representation]() + { + buf.setCheckpoint(); + SCOPE_EXIT(buf.dropCheckpoint()); + if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n')) + return true; + + buf.rollbackToCheckpoint(); + return false; + }; + + auto deserialize_nested = [&nested_serialization, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) + { + auto * pos = buf.position(); + if constexpr (escaped) + nested_serialization->deserializeTextEscaped(nested_column, buf, settings); + else + nested_serialization->deserializeTextRaw(nested_column, buf, settings); + /// Check that we don't have any unread data in PeekableReadBuffer own memory. + if (likely(!buf.hasUnreadData())) + return; + + /// We have some unread data in PeekableReadBuffer own memory. + /// It can happen only if there is a string instead of a number + /// or if someone uses tab or LF in TSV null_representation. + /// In the first case we cannot continue reading anyway. 
The second case seems to be unlikely. + /// We also should delete incorrectly deserialized value from nested column. + nested_column.popBack(1); + + if (null_representation.find('\t') != std::string::npos || null_representation.find('\n') != std::string::npos) + throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation " + "containing '\\t' or '\\n' may not work correctly for large input."); + + WriteBufferFromOwnString parsed_value; + if constexpr (escaped) + nested_serialization->serializeTextEscaped(nested_column, nested_column.size() - 1, parsed_value, settings); + else + nested_serialization->serializeTextRaw(nested_column, nested_column.size() - 1, parsed_value, settings); + throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while parsing \"{}{}\" as Nullable" + " at position {}: got \"{}\", which was deserialized as \"{}\". " + "It seems that input data is ill-formatted.", + std::string(pos, buf.buffer().end()), + std::string(istr.position(), std::min(size_t(10), istr.available())), + istr.count(), std::string(pos, buf.position() - pos), parsed_value.str()); + }; + + return safeDeserialize<ReturnType>(column, *nested_serialization, check_for_null, deserialize_nested); +} + +void SerializationNullable::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast<const ColumnNullable &>(column); + + if (col.isNullAt(row_num)) + writeCString("NULL", ostr); + else + nested->serializeTextQuoted(col.getNestedColumn(), row_num, ostr, settings); +} + + +void SerializationNullable::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextQuotedImpl<void>(column, istr, settings, nested); +} + +template<typename ReturnType> +ReturnType SerializationNullable::deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, + 
const SerializationPtr & nested) +{ + if (istr.eof() || (*istr.position() != 'N' && *istr.position() != 'n')) + { + /// This is not null, surely. + return safeDeserialize<ReturnType>(column, *nested, + [] { return false; }, + [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextQuoted(nested_column, istr, settings); }); + } + + /// Check if we have enough data in buffer to check if it's a null. + if (istr.available() >= 4) + { + auto check_for_null = [&istr]() + { + auto * pos = istr.position(); + if (checkStringCaseInsensitive("NULL", istr)) + return true; + istr.position() = pos; + return false; + }; + auto deserialize_nested = [&nested, &settings, &istr] (IColumn & nested_column) + { + nested->deserializeTextQuoted(nested_column, istr, settings); + }; + return safeDeserialize<ReturnType>(column, *nested, check_for_null, deserialize_nested); + } + + /// We don't have enough data in buffer to check if it's a NULL + /// and we cannot check it just by one symbol (otherwise we won't be able + /// to differentiate for example NULL and NaN for float) + /// Use PeekableReadBuffer to make a checkpoint before checking + /// null and rollback if the check was failed. + PeekableReadBuffer buf(istr, true); + auto check_for_null = [&buf]() + { + buf.setCheckpoint(); + SCOPE_EXIT(buf.dropCheckpoint()); + if (checkStringCaseInsensitive("NULL", buf)) + return true; + + buf.rollbackToCheckpoint(); + return false; + }; + + auto deserialize_nested = [&nested, &settings, &buf] (IColumn & nested_column) + { + nested->deserializeTextQuoted(nested_column, buf, settings); + /// Check that we don't have any unread data in PeekableReadBuffer own memory. + if (likely(!buf.hasUnreadData())) + return; + + /// We have some unread data in PeekableReadBuffer own memory. + /// It can happen only if there is an unquoted string instead of a number. + /// We also should delete incorrectly deserialized value from nested column. 
+ nested_column.popBack(1); + throw DB::ParsingException( + ErrorCodes::CANNOT_READ_ALL_DATA, + "Error while parsing Nullable: got an unquoted string {} instead of a number", + String(buf.position(), std::min(10ul, buf.available()))); + }; + + return safeDeserialize<ReturnType>(column, *nested, check_for_null, deserialize_nested); +} + + +void SerializationNullable::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeWholeTextImpl<void>(column, istr, settings, nested); +} + +template <typename ReturnType> +ReturnType SerializationNullable::deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, + const SerializationPtr & nested) +{ + PeekableReadBuffer buf(istr, true); + auto check_for_null = [&buf]() + { + buf.setCheckpoint(); + SCOPE_EXIT(buf.dropCheckpoint()); + + if (checkStringCaseInsensitive("NULL", buf) && buf.eof()) + return true; + + buf.rollbackToCheckpoint(); + if (checkStringCaseInsensitive("ᴺᵁᴸᴸ", buf) && buf.eof()) + return true; + + buf.rollbackToCheckpoint(); + return false; + }; + + auto deserialize_nested = [&nested, &settings, &buf] (IColumn & nested_column) + { + nested->deserializeWholeText(nested_column, buf, settings); + assert(!buf.hasUnreadData()); + }; + + return safeDeserialize<ReturnType>(column, *nested, check_for_null, deserialize_nested); +} + + +void SerializationNullable::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast<const ColumnNullable &>(column); + + if (col.isNullAt(row_num)) + writeString(settings.csv.null_representation, ostr); + else + nested->serializeTextCSV(col.getNestedColumn(), row_num, ostr, settings); +} + +void SerializationNullable::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextCSVImpl<void>(column, istr, settings, nested); +} + 
+template<typename ReturnType> +ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, + const SerializationPtr & nested_serialization) +{ + const String & null_representation = settings.csv.null_representation; + if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0])) + { + /// This is not null, surely. + return safeDeserialize<ReturnType>(column, *nested_serialization, + [] { return false; }, + [&nested_serialization, &istr, &settings] (IColumn & nested_column) { nested_serialization->deserializeTextCSV(nested_column, istr, settings); }); + } + + /// Check if we have enough data in buffer to check if it's a null. + if (settings.csv.custom_delimiter.empty() && istr.available() > null_representation.size()) + { + auto check_for_null = [&istr, &null_representation, &settings]() + { + auto * pos = istr.position(); + if (checkString(null_representation, istr) && (*istr.position() == settings.csv.delimiter || *istr.position() == '\r' || *istr.position() == '\n')) + return true; + istr.position() = pos; + return false; + }; + auto deserialize_nested = [&nested_serialization, &settings, &istr] (IColumn & nested_column) + { + nested_serialization->deserializeTextCSV(nested_column, istr, settings); + }; + return safeDeserialize<ReturnType>(column, *nested_serialization, check_for_null, deserialize_nested); + } + + /// We don't have enough data in buffer to check if it's a null. + /// Use PeekableReadBuffer to make a checkpoint before checking null + /// representation and rollback if the check was failed. 
+ PeekableReadBuffer buf(istr, true); + auto check_for_null = [&buf, &null_representation, &settings]() + { + buf.setCheckpoint(); + SCOPE_EXIT(buf.dropCheckpoint()); + if (checkString(null_representation, buf)) + { + if (!settings.csv.custom_delimiter.empty()) + { + if (checkString(settings.csv.custom_delimiter, buf)) + { + /// Rollback to the beginning of custom delimiter. + buf.rollbackToCheckpoint(); + assertString(null_representation, buf); + return true; + } + } + else if (buf.eof() || *buf.position() == settings.csv.delimiter || *buf.position() == '\r' || *buf.position() == '\n') + return true; + } + + buf.rollbackToCheckpoint(); + return false; + }; + + auto deserialize_nested = [&nested_serialization, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) + { + auto * pos = buf.position(); + nested_serialization->deserializeTextCSV(nested_column, buf, settings); + /// Check that we don't have any unread data in PeekableReadBuffer own memory. + if (likely(!buf.hasUnreadData())) + return; + + /// We have some unread data in PeekableReadBuffer own memory. + /// It can happen only if there is an unquoted string instead of a number + /// or if someone uses csv delimiter, LF or CR in CSV null representation. + /// In the first case we cannot continue reading anyway. The second case seems to be unlikely. + /// We also should delete incorrectly deserialized value from nested column. 
+ nested_column.popBack(1); + + if (null_representation.find(settings.csv.delimiter) != std::string::npos || null_representation.find('\r') != std::string::npos + || null_representation.find('\n') != std::string::npos) + throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "CSV custom null representation containing " + "format_csv_delimiter, '\\r' or '\\n' may not work correctly for large input."); + + WriteBufferFromOwnString parsed_value; + nested_serialization->serializeTextCSV(nested_column, nested_column.size() - 1, parsed_value, settings); + throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while parsing \"{}{}\" as Nullable" + " at position {}: got \"{}\", which was deserialized as \"{}\". " + "It seems that input data is ill-formatted.", + std::string(pos, buf.buffer().end()), + std::string(istr.position(), std::min(size_t(10), istr.available())), + istr.count(), std::string(pos, buf.position() - pos), parsed_value.str()); + }; + + return safeDeserialize<ReturnType>(column, *nested_serialization, check_for_null, deserialize_nested); +} + +void SerializationNullable::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast<const ColumnNullable &>(column); + + /// In simple text format (like 'Pretty' format) (these formats are suitable only for output and cannot be parsed back), + /// data is printed without escaping. + /// It makes theoretically impossible to distinguish between NULL and some string value, regardless on how do we print NULL. + /// For this reason, we output NULL in a bit strange way. + /// This assumes UTF-8 and proper font support. This is Ok, because Pretty formats are "presentational", not for data exchange. 
+ + if (col.isNullAt(row_num)) + { + if (settings.pretty.charset == FormatSettings::Pretty::Charset::UTF8) + writeCString("ᴺᵁᴸᴸ", ostr); + else + writeCString("NULL", ostr); + } + else + nested->serializeText(col.getNestedColumn(), row_num, ostr, settings); +} + +void SerializationNullable::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast<const ColumnNullable &>(column); + + if (col.isNullAt(row_num)) + writeCString("null", ostr); + else + nested->serializeTextJSON(col.getNestedColumn(), row_num, ostr, settings); +} + +void SerializationNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextJSONImpl<void>(column, istr, settings, nested); +} + +template<typename ReturnType> +ReturnType SerializationNullable::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, + const SerializationPtr & nested) +{ + return safeDeserialize<ReturnType>(column, *nested, + [&istr] { return checkStringByFirstCharacterAndAssertTheRest("null", istr); }, + [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextJSON(nested_column, istr, settings); }); +} + +void SerializationNullable::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast<const ColumnNullable &>(column); + + if (col.isNullAt(row_num)) + writeCString("\\N", ostr); + else + nested->serializeTextXML(col.getNestedColumn(), row_num, ostr, settings); +} + +template bool SerializationNullable::deserializeWholeTextImpl<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); +template bool SerializationNullable::deserializeTextEscapedImpl<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const 
SerializationPtr & nested); +template bool SerializationNullable::deserializeTextQuotedImpl<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); +template bool SerializationNullable::deserializeTextCSVImpl<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); +template bool SerializationNullable::deserializeTextJSONImpl<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); +template bool SerializationNullable::deserializeTextRawImpl<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationNullable.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNullable.h new file mode 100644 index 00000000000..3ec01b46de5 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNullable.h @@ -0,0 +1,108 @@ +#pragma once + +#include <DataTypes/Serializations/ISerialization.h> + +namespace DB +{ + +class SerializationNullable : public ISerialization +{ +private: + SerializationPtr nested; + +public: + explicit SerializationNullable(const SerializationPtr & nested_) : nested(nested_) {} + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, 
+ SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + /** It is questionable, how NULL values could be represented in CSV. There are three variants: + * 1. \N + * 2. empty string (without quotes) + * 3. NULL + * We support all of them (however, second variant is supported by CSVRowInputFormat, not by deserializeTextCSV). + * (see also input_format_defaults_for_omitted_fields and input_format_csv_unquoted_null_literal_as_null settings) + * In CSV, non-NULL string value, starting with \N characters, must be placed in quotes, to avoid ambiguity. 
+ */ + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + + /// If ReturnType is bool, check for NULL and deserialize value into non-nullable column (and return true) or insert default value of nested type (and return false) + /// If ReturnType is void, deserialize Nullable(T) + template <typename ReturnType = bool> + static ReturnType deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); + template <typename ReturnType = bool> + static ReturnType deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); + template <typename ReturnType = bool> + static ReturnType deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); + template <typename ReturnType = bool> + static ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); + template <typename ReturnType = bool> + static ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); + template 
<typename ReturnType = bool> + static ReturnType deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); + template <typename ReturnType = bool, bool escaped> + static ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); + +private: + struct SubcolumnCreator : public ISubcolumnCreator + { + const ColumnPtr null_map; + + explicit SubcolumnCreator(const ColumnPtr & null_map_) : null_map(null_map_) {} + + DataTypePtr create(const DataTypePtr & prev) const override; + SerializationPtr create(const SerializationPtr & prev) const override; + ColumnPtr create(const ColumnPtr & prev) const override; + }; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationNumber.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNumber.cpp new file mode 100644 index 00000000000..94b44d5cc66 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNumber.cpp @@ -0,0 +1,182 @@ +#include <DataTypes/Serializations/SerializationNumber.h> +#include <Columns/ColumnVector.h> +#include <Columns/ColumnConst.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> +#include <Common/NaNUtils.h> +#include <Common/typeid_cast.h> +#include <Common/assert_cast.h> +#include <Formats/FormatSettings.h> +#include <Formats/ProtobufReader.h> +#include <Core/Field.h> + +#include <ranges> + +namespace DB +{ + +template <typename T> +void SerializationNumber<T>::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeText(assert_cast<const ColumnVector<T> &>(column).getData()[row_num], ostr); +} + +template <typename T> +void SerializationNumber<T>::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const +{ + T x; + + if constexpr (is_integer<T> && is_arithmetic_v<T>) + 
readIntTextUnsafe(x, istr); + else + readText(x, istr); + + assert_cast<ColumnVector<T> &>(column).getData().push_back(x); + + if (whole && !istr.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "Number"); +} + +template <typename T> +void SerializationNumber<T>::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + auto x = assert_cast<const ColumnVector<T> &>(column).getData()[row_num]; + writeJSONNumber(x, ostr, settings); +} + +template <typename T> +void SerializationNumber<T>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + bool has_quote = false; + if (!istr.eof() && *istr.position() == '"') /// We understand the number both in quotes and without. + { + has_quote = true; + ++istr.position(); + } + + FieldType x; + + /// null + if (!has_quote && !istr.eof() && *istr.position() == 'n') + { + ++istr.position(); + assertString("ull", istr); + + x = NaNOrZero<T>(); + } + else + { + static constexpr bool is_uint8 = std::is_same_v<T, UInt8>; + static constexpr bool is_int8 = std::is_same_v<T, Int8>; + + if (settings.json.read_bools_as_numbers || is_uint8 || is_int8) + { + // extra conditions to parse true/false strings into 1/0 + if (istr.eof()) + throwReadAfterEOF(); + if (*istr.position() == 't' || *istr.position() == 'f') + { + bool tmp = false; + readBoolTextWord(tmp, istr); + x = tmp; + } + else + readText(x, istr); + } + else + { + readText(x, istr); + } + + if (has_quote) + assertChar('"', istr); + } + + assert_cast<ColumnVector<T> &>(column).getData().push_back(x); +} + +template <typename T> +void SerializationNumber<T>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & /*settings*/) const +{ + FieldType x; + readCSV(x, istr); + assert_cast<ColumnVector<T> &>(column).getData().push_back(x); +} + +template <typename T> +void SerializationNumber<T>::serializeBinary(const Field & field, 
WriteBuffer & ostr, const FormatSettings &) const +{ + /// ColumnVector<T>::ValueType is a narrower type. For example, UInt8, when the Field type is UInt64 + typename ColumnVector<T>::ValueType x = static_cast<typename ColumnVector<T>::ValueType>(field.get<FieldType>()); + writeBinaryLittleEndian(x, ostr); +} + +template <typename T> +void SerializationNumber<T>::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const +{ + typename ColumnVector<T>::ValueType x; + readBinaryLittleEndian(x, istr); + field = NearestFieldType<FieldType>(x); +} + +template <typename T> +void SerializationNumber<T>::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeBinaryLittleEndian(assert_cast<const ColumnVector<T> &>(column).getData()[row_num], ostr); +} + +template <typename T> +void SerializationNumber<T>::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + typename ColumnVector<T>::ValueType x; + readBinaryLittleEndian(x, istr); + assert_cast<ColumnVector<T> &>(column).getData().push_back(x); +} + +template <typename T> +void SerializationNumber<T>::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const typename ColumnVector<T>::Container & x = typeid_cast<const ColumnVector<T> &>(column).getData(); + if (const size_t size = x.size(); limit == 0 || offset + limit > size) + limit = size - offset; + + if (limit == 0) + return; + + if constexpr (std::endian::native == std::endian::big && sizeof(T) >= 2) + for (size_t i = offset; i < offset + limit; ++i) + writeBinaryLittleEndian(x[i], ostr); + else + ostr.write(reinterpret_cast<const char *>(&x[offset]), sizeof(typename ColumnVector<T>::ValueType) * limit); +} + +template <typename T> +void SerializationNumber<T>::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const +{ + typename 
ColumnVector<T>::Container & x = typeid_cast<ColumnVector<T> &>(column).getData(); + const size_t initial_size = x.size(); + x.resize(initial_size + limit); + const size_t size = istr.readBig(reinterpret_cast<char*>(&x[initial_size]), sizeof(typename ColumnVector<T>::ValueType) * limit); + x.resize(initial_size + size / sizeof(typename ColumnVector<T>::ValueType)); + + if constexpr (std::endian::native == std::endian::big && sizeof(T) >= 2) + for (size_t i = initial_size; i < x.size(); ++i) + transformEndianness<std::endian::big, std::endian::little>(x[i]); +} + +template class SerializationNumber<UInt8>; +template class SerializationNumber<UInt16>; +template class SerializationNumber<UInt32>; +template class SerializationNumber<UInt64>; +template class SerializationNumber<UInt128>; +template class SerializationNumber<UInt256>; +template class SerializationNumber<Int8>; +template class SerializationNumber<Int16>; +template class SerializationNumber<Int32>; +template class SerializationNumber<Int64>; +template class SerializationNumber<Int128>; +template class SerializationNumber<Int256>; +template class SerializationNumber<Float32>; +template class SerializationNumber<Float64>; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationNumber.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNumber.h new file mode 100644 index 00000000000..972c6c9a30f --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationNumber.h @@ -0,0 +1,36 @@ +#pragma once + +#include <Core/Types.h> +#include <DataTypes/Serializations/SimpleTextSerialization.h> + +namespace DB +{ + +template <typename T> +class ColumnVector; + +template <typename T> +class SerializationNumber : public SimpleTextSerialization +{ + static_assert(is_arithmetic_v<T>); + +public: + using FieldType = T; + using ColumnType = ColumnVector<T>; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const 
override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + /** Format is platform-dependent. */ + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationObject.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationObject.cpp new file mode 100644 index 00000000000..df9489213c8 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationObject.cpp @@ -0,0 +1,557 @@ +#include <DataTypes/Serializations/SerializationObject.h> +#include <DataTypes/Serializations/JSONDataParser.h> +#include <DataTypes/Serializations/SerializationString.h> +#include <DataTypes/DataTypeString.h> +#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/ObjectUtils.h> +#include <DataTypes/DataTypeFactory.h> +#include <DataTypes/NestedUtils.h> +#include <Common/JSONParsers/SimdJSONParser.h> +#include <Common/JSONParsers/RapidJSONParser.h> +#include 
<Common/HashTable/HashSet.h> +#include <Columns/ColumnObject.h> +#include <Columns/ColumnString.h> +#include <Functions/FunctionsConversion.h> + +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> +#include <IO/VarInt.h> +#include <magic_enum.hpp> +#include <memory> +#include <string> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; + extern const int INCORRECT_DATA; + extern const int CANNOT_READ_ALL_DATA; + extern const int ARGUMENT_OUT_OF_BOUND; + extern const int LOGICAL_ERROR; +} + +template <typename Parser> +template <typename Reader> +void SerializationObject<Parser>::deserializeTextImpl(IColumn & column, Reader && reader) const +{ + auto & column_object = assert_cast<ColumnObject &>(column); + + String buf; + reader(buf); + std::optional<ParseResult> result; + + /// Treat empty string as an empty object + /// for better CAST from String to Object. + if (!buf.empty()) + { + auto parser = parsers_pool.get([] { return new Parser; }); + result = parser->parse(buf.data(), buf.size()); + } + else + { + result = ParseResult{}; + } + + if (!result) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse object"); + + auto & [paths, values] = *result; + assert(paths.size() == values.size()); + + size_t old_column_size = column_object.size(); + for (size_t i = 0; i < paths.size(); ++i) + { + auto field_info = getFieldInfo(values[i]); + if (field_info.need_fold_dimension) + values[i] = applyVisitor(FieldVisitorFoldDimension(field_info.num_dimensions), std::move(values[i])); + if (isNothing(field_info.scalar_type)) + continue; + + if (!column_object.hasSubcolumn(paths[i])) + { + if (paths[i].hasNested()) + column_object.addNestedSubcolumn(paths[i], field_info, old_column_size); + else + column_object.addSubcolumn(paths[i], old_column_size); + } + + auto & subcolumn = column_object.getSubcolumn(paths[i]); + assert(subcolumn.size() == old_column_size); + + subcolumn.insert(std::move(values[i]), std::move(field_info)); + } + + 
/// Insert default values to missed subcolumns. + const auto & subcolumns = column_object.getSubcolumns(); + for (const auto & entry : subcolumns) + { + if (entry->data.size() == old_column_size) + { + bool inserted = column_object.tryInsertDefaultFromNested(entry); + if (!inserted) + entry->data.insertDefault(); + } + } + + column_object.incrementNumRows(); +} + +template <typename Parser> +void SerializationObject<Parser>::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + deserializeTextImpl(column, [&](String & s) { readStringInto(s, istr); }); +} + +template <typename Parser> +void SerializationObject<Parser>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + deserializeTextImpl(column, [&](String & s) { readEscapedString(s, istr); }); +} + +template <typename Parser> +void SerializationObject<Parser>::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + deserializeTextImpl(column, [&](String & s) { readQuotedStringInto<true>(s, istr); }); +} + +template <typename Parser> +void SerializationObject<Parser>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + deserializeTextImpl(column, [&](String & s) { Parser::readJSON(s, istr); }); +} + +template <typename Parser> +void SerializationObject<Parser>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextImpl(column, [&](String & s) { readCSVStringInto(s, istr, settings.csv); }); +} + +template <typename Parser> +template <typename TSettings> +void SerializationObject<Parser>::checkSerializationIsSupported(const TSettings & settings) const +{ + if (settings.position_independent_encoding) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "DataTypeObject doesn't support serialization with position independent encoding"); +} + +template <typename Parser> +struct 
SerializationObject<Parser>::SerializeStateObject : public ISerialization::SerializeBinaryBulkState +{ + DataTypePtr nested_type; + SerializationPtr nested_serialization; + SerializeBinaryBulkStatePtr nested_state; +}; + +template <typename Parser> +struct SerializationObject<Parser>::DeserializeStateObject : public ISerialization::DeserializeBinaryBulkState +{ + BinarySerializationKind kind; + DataTypePtr nested_type; + SerializationPtr nested_serialization; + DeserializeBinaryBulkStatePtr nested_state; +}; + +template <typename Parser> +void SerializationObject<Parser>::serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + checkSerializationIsSupported(settings); + if (state) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "DataTypeObject doesn't support serialization with non-trivial state"); + + const auto & column_object = assert_cast<const ColumnObject &>(column); + if (!column_object.isFinalized()) + { + auto finalized = column_object.cloneFinalized(); + serializeBinaryBulkStatePrefix(*finalized, settings, state); + return; + } + + settings.path.push_back(Substream::ObjectStructure); + auto * stream = settings.getter(settings.path); + + if (!stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Missing stream for kind of binary serialization"); + + auto [tuple_column, tuple_type] = unflattenObjectToTuple(column_object); + + writeIntBinary(static_cast<UInt8>(BinarySerializationKind::TUPLE), *stream); + writeStringBinary(tuple_type->getName(), *stream); + + auto state_object = std::make_shared<SerializeStateObject>(); + state_object->nested_type = tuple_type; + state_object->nested_serialization = tuple_type->getDefaultSerialization(); + + settings.path.back() = Substream::ObjectData; + state_object->nested_serialization->serializeBinaryBulkStatePrefix(*tuple_column, settings, state_object->nested_state); + + state = std::move(state_object); + 
settings.path.pop_back(); +} + +template <typename Parser> +void SerializationObject<Parser>::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + checkSerializationIsSupported(settings); + auto * state_object = checkAndGetState<SerializeStateObject>(state); + + settings.path.push_back(Substream::ObjectData); + state_object->nested_serialization->serializeBinaryBulkStateSuffix(settings, state_object->nested_state); + settings.path.pop_back(); +} + +template <typename Parser> +void SerializationObject<Parser>::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + checkSerializationIsSupported(settings); + if (state) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "DataTypeObject doesn't support serialization with non-trivial state"); + + settings.path.push_back(Substream::ObjectStructure); + auto * stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!stream) + throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, + "Cannot read kind of binary serialization of DataTypeObject, because its stream is missing"); + + UInt8 kind_raw; + readIntBinary(kind_raw, *stream); + auto kind = magic_enum::enum_cast<BinarySerializationKind>(kind_raw); + if (!kind) + throw Exception(ErrorCodes::INCORRECT_DATA, + "Unknown binary serialization kind of Object: {}", std::to_string(kind_raw)); + + auto state_object = std::make_shared<DeserializeStateObject>(); + state_object->kind = *kind; + + if (state_object->kind == BinarySerializationKind::TUPLE) + { + String data_type_name; + readStringBinary(data_type_name, *stream); + state_object->nested_type = DataTypeFactory::instance().get(data_type_name); + state_object->nested_serialization = state_object->nested_type->getDefaultSerialization(); + + if (!isTuple(state_object->nested_type)) + throw Exception(ErrorCodes::INCORRECT_DATA, + "Data of type Object should be written as 
Tuple, got: {}", data_type_name); + } + else if (state_object->kind == BinarySerializationKind::STRING) + { + state_object->nested_type = std::make_shared<DataTypeString>(); + state_object->nested_serialization = std::make_shared<SerializationString>(); + } + else + { + throw Exception(ErrorCodes::INCORRECT_DATA, + "Unknown binary serialization kind of Object: {}", std::to_string(kind_raw)); + } + + settings.path.push_back(Substream::ObjectData); + state_object->nested_serialization->deserializeBinaryBulkStatePrefix(settings, state_object->nested_state); + settings.path.pop_back(); + + state = std::move(state_object); +} + +template <typename Parser> +void SerializationObject<Parser>::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + checkSerializationIsSupported(settings); + const auto & column_object = assert_cast<const ColumnObject &>(column); + auto * state_object = checkAndGetState<SerializeStateObject>(state); + + if (!column_object.isFinalized()) + { + auto finalized = column_object.cloneFinalized(); + serializeBinaryBulkWithMultipleStreams(*finalized, offset, limit, settings, state); + return; + } + + auto [tuple_column, tuple_type] = unflattenObjectToTuple(column_object); + + if (!state_object->nested_type->equals(*tuple_type)) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Types of internal column of Object mismatched. 
Expected: {}, Got: {}", + state_object->nested_type->getName(), tuple_type->getName()); + } + + settings.path.push_back(Substream::ObjectData); + if (auto * stream = settings.getter(settings.path)) + { + state_object->nested_serialization->serializeBinaryBulkWithMultipleStreams( + *tuple_column, offset, limit, settings, state_object->nested_state); + } + + settings.path.pop_back(); +} + +template <typename Parser> +void SerializationObject<Parser>::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + checkSerializationIsSupported(settings); + if (!column->empty()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "DataTypeObject cannot be deserialized to non-empty column"); + + auto mutable_column = column->assumeMutable(); + auto & column_object = assert_cast<ColumnObject &>(*mutable_column); + auto * state_object = checkAndGetState<DeserializeStateObject>(state); + + settings.path.push_back(Substream::ObjectData); + if (state_object->kind == BinarySerializationKind::STRING) + deserializeBinaryBulkFromString(column_object, limit, settings, *state_object, cache); + else + deserializeBinaryBulkFromTuple(column_object, limit, settings, *state_object, cache); + + settings.path.pop_back(); + column_object.checkConsistency(); + column_object.finalize(); + column = std::move(mutable_column); +} + +template <typename Parser> +void SerializationObject<Parser>::deserializeBinaryBulkFromString( + ColumnObject & column_object, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeStateObject & state, + SubstreamsCache * cache) const +{ + ColumnPtr column_string = state.nested_type->createColumn(); + state.nested_serialization->deserializeBinaryBulkWithMultipleStreams( + column_string, limit, settings, state.nested_state, cache); + + ConvertImplGenericFromString<ColumnString>::executeImpl(*column_string, 
column_object, *this, column_string->size()); +} + +template <typename Parser> +void SerializationObject<Parser>::deserializeBinaryBulkFromTuple( + ColumnObject & column_object, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeStateObject & state, + SubstreamsCache * cache) const +{ + ColumnPtr column_tuple = state.nested_type->createColumn(); + state.nested_serialization->deserializeBinaryBulkWithMultipleStreams( + column_tuple, limit, settings, state.nested_state, cache); + + auto [tuple_paths, tuple_types] = flattenTuple(state.nested_type); + auto flattened_tuple = flattenTuple(column_tuple); + const auto & tuple_columns = assert_cast<const ColumnTuple &>(*flattened_tuple).getColumns(); + + assert(tuple_paths.size() == tuple_types.size()); + size_t num_subcolumns = tuple_paths.size(); + + if (tuple_columns.size() != num_subcolumns) + throw Exception(ErrorCodes::INCORRECT_DATA, + "Inconsistent type ({}) and column ({}) while reading column of type Object", + state.nested_type->getName(), column_tuple->getName()); + + for (size_t i = 0; i < num_subcolumns; ++i) + column_object.addSubcolumn(tuple_paths[i], tuple_columns[i]->assumeMutable()); +} + +template <typename Parser> +void SerializationObject<Parser>::serializeBinary(const Field &, WriteBuffer &, const FormatSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject"); +} + +template <typename Parser> +void SerializationObject<Parser>::deserializeBinary(Field &, ReadBuffer &, const FormatSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject"); +} + +template <typename Parser> +void SerializationObject<Parser>::serializeBinary(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject"); +} + +template <typename Parser> +void SerializationObject<Parser>::deserializeBinary(IColumn &, 
ReadBuffer &, const FormatSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject"); +} + +/// TODO: use format different of JSON in serializations. + +template <typename Parser> +void SerializationObject<Parser>::serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_object = assert_cast<const ColumnObject &>(column); + const auto & subcolumns = column_object.getSubcolumns(); + + writeChar('{', ostr); + for (auto it = subcolumns.begin(); it != subcolumns.end(); ++it) + { + const auto & entry = *it; + if (it != subcolumns.begin()) + writeCString(",", ostr); + + writeDoubleQuoted(entry->path.getPath(), ostr); + writeChar(':', ostr); + serializeTextFromSubcolumn(entry->data, row_num, ostr, settings); + } + writeChar('}', ostr); +} + +template <typename Parser> +template <bool pretty_json> +void SerializationObject<Parser>::serializeTextFromSubcolumn( + const ColumnObject::Subcolumn & subcolumn, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const +{ + const auto & least_common_type = subcolumn.getLeastCommonType(); + + if (subcolumn.isFinalized()) + { + const auto & finalized_column = subcolumn.getFinalizedColumn(); + auto info = least_common_type->getSerializationInfo(finalized_column); + auto serialization = least_common_type->getSerialization(*info); + if constexpr (pretty_json) + serialization->serializeTextJSONPretty(finalized_column, row_num, ostr, settings, indent); + else + serialization->serializeTextJSON(finalized_column, row_num, ostr, settings); + return; + } + + size_t ind = row_num; + if (ind < subcolumn.getNumberOfDefaultsInPrefix()) + { + /// Suboptimal, but it should happen rarely. 
+ auto tmp_column = subcolumn.getLeastCommonType()->createColumn(); + tmp_column->insertDefault(); + + auto info = least_common_type->getSerializationInfo(*tmp_column); + auto serialization = least_common_type->getSerialization(*info); + if constexpr (pretty_json) + serialization->serializeTextJSONPretty(*tmp_column, 0, ostr, settings, indent); + else + serialization->serializeTextJSON(*tmp_column, 0, ostr, settings); + return; + } + + ind -= subcolumn.getNumberOfDefaultsInPrefix(); + for (const auto & part : subcolumn.getData()) + { + if (ind < part->size()) + { + auto part_type = getDataTypeByColumn(*part); + auto info = part_type->getSerializationInfo(*part); + auto serialization = part_type->getSerialization(*info); + if constexpr (pretty_json) + serialization->serializeTextJSONPretty(*part, ind, ostr, settings, indent); + else + serialization->serializeTextJSON(*part, ind, ostr, settings); + return; + } + + ind -= part->size(); + } + + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Index ({}) for text serialization is out of range", row_num); +} + +template <typename Parser> +void SerializationObject<Parser>::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeTextImpl(column, row_num, ostr, settings); +} + +template <typename Parser> +void SerializationObject<Parser>::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + WriteBufferFromOwnString ostr_str; + serializeTextImpl(column, row_num, ostr_str, settings); + writeEscapedString(ostr_str.str(), ostr); +} + +template <typename Parser> +void SerializationObject<Parser>::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + WriteBufferFromOwnString ostr_str; + serializeTextImpl(column, row_num, ostr_str, settings); + writeQuotedString(ostr_str.str(), ostr); +} + +template <typename Parser> 
+void SerializationObject<Parser>::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeTextImpl(column, row_num, ostr, settings); +} + +template <typename Parser> +void SerializationObject<Parser>::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + WriteBufferFromOwnString ostr_str; + serializeTextImpl(column, row_num, ostr_str, settings); + writeCSVString(ostr_str.str(), ostr); +} + +template <typename Parser> +void SerializationObject<Parser>::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const +{ + const auto & column_object = assert_cast<const ColumnObject &>(column); + const auto & subcolumns = column_object.getSubcolumns(); + + writeCString("{\n", ostr); + for (auto it = subcolumns.begin(); it != subcolumns.end(); ++it) + { + const auto & entry = *it; + if (it != subcolumns.begin()) + writeCString(",\n", ostr); + + writeChar(' ', (indent + 1) * 4, ostr); + writeDoubleQuoted(entry->path.getPath(), ostr); + writeCString(": ", ostr); + serializeTextFromSubcolumn<true>(entry->data, row_num, ostr, settings, indent + 1); + } + writeChar('\n', ostr); + writeChar(' ', indent * 4, ostr); + writeChar('}', ostr); +} + + +SerializationPtr getObjectSerialization(const String & schema_format) +{ + if (schema_format == "json") + { +#if USE_SIMDJSON + return std::make_shared<SerializationObject<JSONDataParser<SimdJSONParser>>>(); +#elif USE_RAPIDJSON + return std::make_shared<SerializationObject<JSONDataParser<RapidJSONParser>>>(); +#else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "To use data type Object with JSON format ClickHouse should be built with Simdjson or Rapidjson"); +#endif + } + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unknown schema format '{}'", schema_format); +} + +} diff --git 
a/contrib/clickhouse/src/DataTypes/Serializations/SerializationObject.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationObject.h new file mode 100644 index 00000000000..de54f5739f5 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationObject.h @@ -0,0 +1,119 @@ +#pragma once + +#include <Columns/ColumnObject.h> +#include <DataTypes/Serializations/SimpleTextSerialization.h> +#include <Common/ObjectPool.h> + +namespace DB +{ + +/** Serialization for data type Object. + * Supported only text serialization/deserialization. + * and binary bulk serialization/deserialization without position independent + * encoding, i.e. serialization/deserialization into Native format. + */ +template <typename Parser> +class SerializationObject : public ISerialization +{ +public: + /** In Native format ColumnObject can be serialized + * in two formats: as Tuple or as String. + * The format is the following: + * + * <serialization_kind> 1 byte -- 0 if Tuple, 1 if String. + * [type_name] -- Only for tuple serialization. + * ... data of internal column ... + * + * ClickHouse client serializazes objects as tuples. + * String serialization exists for clients, which cannot + * do parsing by themselves and they can send raw data as + * string. It will be parsed on the server side. 
+ */ + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override; + 
void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + +private: + enum class BinarySerializationKind : UInt8 + { + TUPLE = 0, + STRING = 1, + }; + + struct SerializeStateObject; + struct DeserializeStateObject; + + void deserializeBinaryBulkFromString( + ColumnObject & column_object, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeStateObject & state, + SubstreamsCache * cache) const; + + void deserializeBinaryBulkFromTuple( + ColumnObject & column_object, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeStateObject & state, + SubstreamsCache * cache) const; + + template <typename TSettings> + void checkSerializationIsSupported(const TSettings & settings) const; + + template <typename Reader> + void deserializeTextImpl(IColumn & column, Reader && reader) const; + + void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const; + + template <bool pretty_json = false> + void serializeTextFromSubcolumn(const ColumnObject::Subcolumn & subcolumn, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent = 0) const; + + /// Pool of parser objects to make SerializationObject thread safe. 
+ mutable SimpleObjectPool<Parser> parsers_pool; +}; + +SerializationPtr getObjectSerialization(const String & schema_format); + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationSparse.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationSparse.cpp new file mode 100644 index 00000000000..4d7514271ad --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationSparse.cpp @@ -0,0 +1,387 @@ +#include <DataTypes/Serializations/SerializationSparse.h> +#include <DataTypes/DataTypesNumber.h> +#include <Columns/IColumn.h> +#include <Columns/ColumnVector.h> +#include <Columns/ColumnSparse.h> +#include <Common/assert_cast.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> +#include <IO/VarInt.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR; +} + +namespace +{ + +/// 2^62, because VarInt supports only values < 2^63. +constexpr auto END_OF_GRANULE_FLAG = 1ULL << 62; + +struct DeserializeStateSparse : public ISerialization::DeserializeBinaryBulkState +{ + /// Number of default values, that remain from previous read. + size_t num_trailing_defaults = 0; + /// Do we have non-default value after @num_trailing_defaults? + bool has_value_after_defaults = false; + ISerialization::DeserializeBinaryBulkStatePtr nested; + + void reset() + { + num_trailing_defaults = 0; + has_value_after_defaults = false; + } +}; + +void serializeOffsets(const IColumn::Offsets & offsets, WriteBuffer & ostr, size_t start, size_t end) +{ + size_t size = offsets.size(); + for (size_t i = 0; i < size; ++i) + { + size_t group_size = offsets[i] - start; + writeVarUInt(group_size, ostr); + start += group_size + 1; + } + + size_t group_size = start < end ? end - start : 0; + group_size |= END_OF_GRANULE_FLAG; + writeVarUInt(group_size, ostr); +} + + +/// Returns number of read rows. +/// @start is the size of column before reading offsets. 
+size_t deserializeOffsets(IColumn::Offsets & offsets, + ReadBuffer & istr, size_t start, size_t limit, DeserializeStateSparse & state) +{ + if (limit && state.num_trailing_defaults >= limit) + { + state.num_trailing_defaults -= limit; + return limit; + } + + /// Just try to guess number of offsets. + offsets.reserve(offsets.size() + + static_cast<size_t>(limit * (1.0 - ColumnSparse::DEFAULT_RATIO_FOR_SPARSE_SERIALIZATION))); + + bool first = true; + size_t total_rows = state.num_trailing_defaults; + if (state.has_value_after_defaults) + { + offsets.push_back(start + state.num_trailing_defaults); + first = false; + + state.has_value_after_defaults = false; + state.num_trailing_defaults = 0; + ++total_rows; + } + + size_t group_size; + while (!istr.eof()) + { + readVarUInt(group_size, istr); + + bool end_of_granule = group_size & END_OF_GRANULE_FLAG; + group_size &= ~END_OF_GRANULE_FLAG; + + size_t next_total_rows = total_rows + group_size; + group_size += state.num_trailing_defaults; + + if (limit && next_total_rows >= limit) + { + /// If it was not last group in granule, + /// we have to add current non-default value at further reads. + state.num_trailing_defaults = next_total_rows - limit; + state.has_value_after_defaults = !end_of_granule; + return limit; + } + + if (end_of_granule) + { + state.has_value_after_defaults = false; + state.num_trailing_defaults = group_size; + } + else + { + /// If we add value to column for first time in current read, + /// start from column's current size, because it can have some defaults after last offset, + /// otherwise just start from previous offset. 
+ size_t start_of_group = start; + if (!first && !offsets.empty()) + start_of_group = offsets.back() + 1; + if (first) + first = false; + + offsets.push_back(start_of_group + group_size); + + state.num_trailing_defaults = 0; + state.has_value_after_defaults = false; + ++next_total_rows; + } + + total_rows = next_total_rows; + } + + return total_rows; +} + +} + +SerializationSparse::SerializationSparse(const SerializationPtr & nested_) + : nested(nested_) +{ +} + +SerializationPtr SerializationSparse::SubcolumnCreator::create(const SerializationPtr & prev) const +{ + return std::make_shared<SerializationSparse>(prev); +} + +ColumnPtr SerializationSparse::SubcolumnCreator::create(const ColumnPtr & prev) const +{ + return ColumnSparse::create(prev, offsets, size); +} + +void SerializationSparse::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + const auto * column_sparse = data.column ? &assert_cast<const ColumnSparse &>(*data.column) : nullptr; + size_t column_size = column_sparse ? column_sparse->size() : 0; + + settings.path.push_back(Substream::SparseOffsets); + auto offsets_data = SubstreamData(std::make_shared<SerializationNumber<UInt64>>()) + .withType(data.type ? std::make_shared<DataTypeUInt64>() : nullptr) + .withColumn(column_sparse ? column_sparse->getOffsetsPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + settings.path.back().data = offsets_data; + callback(settings.path); + + settings.path.back() = Substream::SparseElements; + settings.path.back().creator = std::make_shared<SubcolumnCreator>(offsets_data.column, column_size); + settings.path.back().data = data; + + auto next_data = SubstreamData(nested) + .withType(data.type) + .withColumn(column_sparse ? 
column_sparse->getValuesPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + nested->enumerateStreams(settings, callback, next_data); + settings.path.pop_back(); +} + +void SerializationSparse::serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::SparseElements); + if (const auto * column_sparse = typeid_cast<const ColumnSparse *>(&column)) + nested->serializeBinaryBulkStatePrefix(column_sparse->getValuesColumn(), settings, state); + else + nested->serializeBinaryBulkStatePrefix(column, settings, state); + + settings.path.pop_back(); +} + +void SerializationSparse::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + size_t size = column.size(); + + auto offsets_column = DataTypeNumber<IColumn::Offset>().createColumn(); + auto & offsets_data = assert_cast<ColumnVector<IColumn::Offset> &>(*offsets_column).getData(); + column.getIndicesOfNonDefaultRows(offsets_data, offset, limit); + + settings.path.push_back(Substream::SparseOffsets); + if (auto * stream = settings.getter(settings.path)) + { + size_t end = limit && offset + limit < size ? 
offset + limit : size; + serializeOffsets(offsets_data, *stream, offset, end); + } + + if (!offsets_data.empty()) + { + settings.path.back() = Substream::SparseElements; + if (const auto * column_sparse = typeid_cast<const ColumnSparse *>(&column)) + { + const auto & values = column_sparse->getValuesColumn(); + size_t begin = column_sparse->getValueIndex(offsets_data[0]); + size_t end = column_sparse->getValueIndex(offsets_data.back()); + nested->serializeBinaryBulkWithMultipleStreams(values, begin, end - begin + 1, settings, state); + } + else + { + auto values = column.index(*offsets_column, 0); + nested->serializeBinaryBulkWithMultipleStreams(*values, 0, values->size(), settings, state); + } + } + + settings.path.pop_back(); +} + +void SerializationSparse::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + settings.path.push_back(Substream::SparseElements); + nested->serializeBinaryBulkStateSuffix(settings, state); + settings.path.pop_back(); +} + +void SerializationSparse::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + auto state_sparse = std::make_shared<DeserializeStateSparse>(); + + settings.path.push_back(Substream::SparseElements); + nested->deserializeBinaryBulkStatePrefix(settings, state_sparse->nested); + settings.path.pop_back(); + + state = std::move(state_sparse); +} + +void SerializationSparse::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto * state_sparse = checkAndGetState<DeserializeStateSparse>(state); + + if (auto cached_column = getFromSubstreamsCache(cache, settings.path)) + { + column = cached_column; + return; + } + + if (!settings.continuous_reading) + state_sparse->reset(); + + auto mutable_column = column->assumeMutable(); + 
auto & column_sparse = assert_cast<ColumnSparse &>(*mutable_column); + auto & offsets_data = column_sparse.getOffsetsData(); + + size_t old_size = offsets_data.size(); + + size_t read_rows = 0; + settings.path.push_back(Substream::SparseOffsets); + if (auto * stream = settings.getter(settings.path)) + read_rows = deserializeOffsets(offsets_data, *stream, column_sparse.size(), limit, *state_sparse); + + auto & values_column = column_sparse.getValuesPtr(); + size_t values_limit = offsets_data.size() - old_size; + + settings.path.back() = Substream::SparseElements; + /// Do not use substream cache while reading values column, because ColumnSparse can be cached only in a whole. + nested->deserializeBinaryBulkWithMultipleStreams(values_column, values_limit, settings, state_sparse->nested, nullptr); + settings.path.pop_back(); + + if (offsets_data.size() + 1 != values_column->size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Inconsistent sizes of values and offsets in SerializationSparse." + " Offsets size: {}, values size: {}", offsets_data.size(), values_column->size()); + + /// 'insertManyDefaults' just increases size of column. + column_sparse.insertManyDefaults(read_rows); + column = std::move(mutable_column); + addToSubstreamsCache(cache, settings.path, column); +} + +/// All methods below just wrap nested serialization. 
+ +void SerializationSparse::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested->serializeBinary(field, ostr, settings); +} + +void SerializationSparse::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const +{ + nested->deserializeBinary(field, istr, settings); +} + +void SerializationSparse::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_sparse = assert_cast<const ColumnSparse &>(column); + nested->serializeBinary(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); +} + +void SerializationSparse::deserializeBinary(IColumn &, ReadBuffer &, const FormatSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeBinary' is not implemented for SerializationSparse"); +} + +void SerializationSparse::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_sparse = assert_cast<const ColumnSparse &>(column); + nested->serializeTextEscaped(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); +} + +void SerializationSparse::deserializeTextEscaped(IColumn &, ReadBuffer &, const FormatSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextEscaped' is not implemented for SerializationSparse"); +} + +void SerializationSparse::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_sparse = assert_cast<const ColumnSparse &>(column); + nested->serializeTextQuoted(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); +} + +void SerializationSparse::deserializeTextQuoted(IColumn &, ReadBuffer &, const FormatSettings &) const +{ + throw 
Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextQuoted' is not implemented for SerializationSparse"); +} + +void SerializationSparse::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_sparse = assert_cast<const ColumnSparse &>(column); + nested->serializeTextCSV(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); +} + +void SerializationSparse::deserializeTextCSV(IColumn &, ReadBuffer &, const FormatSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextCSV' is not implemented for SerializationSparse"); +} + +void SerializationSparse::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_sparse = assert_cast<const ColumnSparse &>(column); + nested->serializeText(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); +} + +void SerializationSparse::deserializeWholeText(IColumn &, ReadBuffer &, const FormatSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeWholeText' is not implemented for SerializationSparse"); +} + +void SerializationSparse::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_sparse = assert_cast<const ColumnSparse &>(column); + nested->serializeTextJSON(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); +} + +void SerializationSparse::deserializeTextJSON(IColumn &, ReadBuffer &, const FormatSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextJSON' is not implemented for SerializationSparse"); +} + +void SerializationSparse::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & 
column_sparse = assert_cast<const ColumnSparse &>(column); + nested->serializeTextXML(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationSparse.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationSparse.h new file mode 100644 index 00000000000..2d31fba2509 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationSparse.h @@ -0,0 +1,104 @@ +#pragma once + +#include <DataTypes/Serializations/ISerialization.h> + +namespace DB +{ + + +/** Serialization for sparse representation. + * Only '{serialize,deserialize}BinaryBulk' makes sense. + * Format: + * Values and offsets are written to separate substreams. + * There are written only non-default values. + * + * Offsets have position independent format: as i-th offset there + * is written number of default values, that precedes the i-th non-default value. + * Offsets are written in VarInt encoding. + * Additionally at the end of every call of 'serializeBinaryBulkWithMultipleStreams' + * there is written number of default values in the suffix of part of column, + * that we currently writing. This value also marked with a flag, that means the end of portion of data. + * This value is used, e.g. to allow independent reading of granules in MergeTree. 
+ */ +class SerializationSparse final : public ISerialization +{ +public: + explicit SerializationSparse(const SerializationPtr & nested_); + + Kind getKind() const override { return Kind::SPARSE; } + + virtual void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + /// Allows to write ColumnSparse and other columns in sparse serialization. + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + /// Allows to read only ColumnSparse. 
+ void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + +private: + struct 
SubcolumnCreator : public ISubcolumnCreator + { + const ColumnPtr offsets; + const size_t size; + + SubcolumnCreator(const ColumnPtr & offsets_, size_t size_) + : offsets(offsets_), size(size_) {} + + DataTypePtr create(const DataTypePtr & prev) const override { return prev; } + SerializationPtr create(const SerializationPtr & prev) const override; + ColumnPtr create(const ColumnPtr & prev) const override; + }; + + SerializationPtr nested; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationString.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationString.cpp new file mode 100644 index 00000000000..46fd9d5272d --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationString.cpp @@ -0,0 +1,365 @@ +#include <DataTypes/Serializations/SerializationString.h> + +#include <Columns/ColumnString.h> + +#include <Common/typeid_cast.h> +#include <Common/assert_cast.h> + +#include <Core/Field.h> + +#include <Formats/FormatSettings.h> + +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> +#include <IO/VarInt.h> +#include <IO/ReadBufferFromString.h> + +#include <base/unit.h> + +#ifdef __SSE2__ + #include <emmintrin.h> +#endif + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; + extern const int TOO_LARGE_STRING_SIZE; +} + +void SerializationString::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const String & s = field.get<const String &>(); + if (settings.max_binary_string_size && s.size() > settings.max_binary_string_size) + throw Exception( + ErrorCodes::TOO_LARGE_STRING_SIZE, + "Too large string size: {}. The maximum is: {}. 
To increase the maximum, use setting " + "format_binary_max_string_size", + s.size(), + settings.max_binary_string_size); + + writeVarUInt(s.size(), ostr); + writeString(s, ostr); +} + + +void SerializationString::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const +{ + UInt64 size; + readVarUInt(size, istr); + if (settings.max_binary_string_size && size > settings.max_binary_string_size) + throw Exception( + ErrorCodes::TOO_LARGE_STRING_SIZE, + "Too large string size: {}. The maximum is: {}. To increase the maximum, use setting " + "format_binary_max_string_size", + size, + settings.max_binary_string_size); + + field = String(); + String & s = field.get<String &>(); + s.resize(size); + istr.readStrict(s.data(), size); +} + + +void SerializationString::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const StringRef & s = assert_cast<const ColumnString &>(column).getDataAt(row_num); + if (settings.max_binary_string_size && s.size > settings.max_binary_string_size) + throw Exception( + ErrorCodes::TOO_LARGE_STRING_SIZE, + "Too large string size: {}. The maximum is: {}. To increase the maximum, use setting " + "format_binary_max_string_size", + s.size, + settings.max_binary_string_size); + + writeVarUInt(s.size, ostr); + writeString(s, ostr); +} + + +void SerializationString::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + ColumnString & column_string = assert_cast<ColumnString &>(column); + ColumnString::Chars & data = column_string.getChars(); + ColumnString::Offsets & offsets = column_string.getOffsets(); + + UInt64 size; + readVarUInt(size, istr); + if (settings.max_binary_string_size && size > settings.max_binary_string_size) + throw Exception( + ErrorCodes::TOO_LARGE_STRING_SIZE, + "Too large string size: {}. The maximum is: {}. 
To increase the maximum, use setting " + "format_binary_max_string_size", + size, + settings.max_binary_string_size); + + size_t old_chars_size = data.size(); + size_t offset = old_chars_size + size + 1; + offsets.push_back(offset); + + try + { + data.resize(offset); + istr.readStrict(reinterpret_cast<char*>(&data[offset - size - 1]), size); + data.back() = 0; + } + catch (...) + { + offsets.pop_back(); + data.resize_assume_reserved(old_chars_size); + throw; + } +} + + +void SerializationString::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const ColumnString & column_string = typeid_cast<const ColumnString &>(column); + const ColumnString::Chars & data = column_string.getChars(); + const ColumnString::Offsets & offsets = column_string.getOffsets(); + + size_t size = column_string.size(); + if (!size) + return; + + size_t end = limit && offset + limit < size + ? offset + limit + : size; + + if (offset == 0) + { + UInt64 str_size = offsets[0] - 1; + writeVarUInt(str_size, ostr); + ostr.write(reinterpret_cast<const char *>(data.data()), str_size); + + ++offset; + } + + for (size_t i = offset; i < end; ++i) + { + UInt64 str_size = offsets[i] - offsets[i - 1] - 1; + writeVarUInt(str_size, ostr); + ostr.write(reinterpret_cast<const char *>(&data[offsets[i - 1]]), str_size); + } +} + + +template <int UNROLL_TIMES> +static NO_INLINE void deserializeBinarySSE2(ColumnString::Chars & data, ColumnString::Offsets & offsets, ReadBuffer & istr, size_t limit) +{ + size_t offset = data.size(); + for (size_t i = 0; i < limit; ++i) + { + if (istr.eof()) + break; + + UInt64 size; + readVarUInt(size, istr); + + static constexpr size_t max_string_size = 16_GiB; /// Arbitrary value to prevent logical errors and overflows, but large enough. + if (size > max_string_size) + throw Exception( + ErrorCodes::TOO_LARGE_STRING_SIZE, + "Too large string size: {}. 
The maximum is: {}.", + size, + max_string_size); + + offset += size + 1; + offsets.push_back(offset); + + data.resize(offset); + + if (size) + { +#ifdef __SSE2__ + /// An optimistic branch in which more efficient copying is possible. + if (offset + 16 * UNROLL_TIMES <= data.capacity() && istr.position() + size + 16 * UNROLL_TIMES <= istr.buffer().end()) + { + const __m128i * sse_src_pos = reinterpret_cast<const __m128i *>(istr.position()); + const __m128i * sse_src_end = sse_src_pos + (size + (16 * UNROLL_TIMES - 1)) / 16 / UNROLL_TIMES * UNROLL_TIMES; + __m128i * sse_dst_pos = reinterpret_cast<__m128i *>(&data[offset - size - 1]); + + while (sse_src_pos < sse_src_end) + { + for (size_t j = 0; j < UNROLL_TIMES; ++j) + _mm_storeu_si128(sse_dst_pos + j, _mm_loadu_si128(sse_src_pos + j)); + + sse_src_pos += UNROLL_TIMES; + sse_dst_pos += UNROLL_TIMES; + } + + istr.position() += size; + } + else +#endif + { + istr.readStrict(reinterpret_cast<char*>(&data[offset - size - 1]), size); + } + } + + data[offset - 1] = 0; + } +} + + +void SerializationString::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const +{ + ColumnString & column_string = typeid_cast<ColumnString &>(column); + ColumnString::Chars & data = column_string.getChars(); + ColumnString::Offsets & offsets = column_string.getOffsets(); + + double avg_chars_size = 1; /// By default reserve only for empty strings. + + if (avg_value_size_hint > 0.0 && avg_value_size_hint > sizeof(offsets[0])) + { + /// Randomly selected. + constexpr auto avg_value_size_hint_reserve_multiplier = 1.2; + + avg_chars_size = (avg_value_size_hint - sizeof(offsets[0])) * avg_value_size_hint_reserve_multiplier; + } + + size_t size_to_reserve = data.size() + static_cast<size_t>(std::ceil(limit * avg_chars_size)); + + /// Never reserve for too big size. 
+ if (size_to_reserve < 256 * 1024 * 1024) + { + try + { + data.reserve(size_to_reserve); + } + catch (Exception & e) + { + e.addMessage( + "(avg_value_size_hint = " + toString(avg_value_size_hint) + + ", avg_chars_size = " + toString(avg_chars_size) + + ", limit = " + toString(limit) + ")"); + throw; + } + } + + offsets.reserve(offsets.size() + limit); + + if (avg_chars_size >= 64) + deserializeBinarySSE2<4>(data, offsets, istr, limit); + else if (avg_chars_size >= 48) + deserializeBinarySSE2<3>(data, offsets, istr, limit); + else if (avg_chars_size >= 32) + deserializeBinarySSE2<2>(data, offsets, istr, limit); + else + deserializeBinarySSE2<1>(data, offsets, istr, limit); +} + + +void SerializationString::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr); +} + + +void SerializationString::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeEscapedString(assert_cast<const ColumnString &>(column).getDataAt(row_num).toView(), ostr); +} + + +template <typename Reader> +static inline void read(IColumn & column, Reader && reader) +{ + ColumnString & column_string = assert_cast<ColumnString &>(column); + ColumnString::Chars & data = column_string.getChars(); + ColumnString::Offsets & offsets = column_string.getOffsets(); + size_t old_chars_size = data.size(); + size_t old_offsets_size = offsets.size(); + try + { + reader(data); + data.push_back(0); + offsets.push_back(data.size()); + } + catch (...) 
+ { + offsets.resize_assume_reserved(old_offsets_size); + data.resize_assume_reserved(old_chars_size); + throw; + } +} + + +void SerializationString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(column, [&](ColumnString::Chars & data) { readStringUntilEOFInto(data, istr); }); +} + + +void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); }); +} + + +void SerializationString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeQuotedString(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr); +} + + +void SerializationString::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + read(column, [&](ColumnString::Chars & data) { readQuotedStringInto<true>(data, istr); }); +} + + +void SerializationString::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeJSONString(assert_cast<const ColumnString &>(column).getDataAt(row_num).toView(), ostr, settings); +} + + +void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (settings.json.read_objects_as_strings && !istr.eof() && *istr.position() == '{') + { + String field; + readJSONObjectPossiblyInvalid(field, istr); + ReadBufferFromString buf(field); + read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); }); + } + else if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"') + { + String field; + readJSONField(field, istr); + Float64 tmp; + ReadBufferFromString buf(field); + if (tryReadFloatText(tmp, buf) && buf.eof()) + read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), 
field.end()); }); + else + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON String value here: {}", field); + } + else + read(column, [&](ColumnString::Chars & data) { readJSONStringInto(data, istr); }); +} + + +void SerializationString::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeXMLStringForTextElement(assert_cast<const ColumnString &>(column).getDataAt(row_num).toView(), ostr); +} + + +void SerializationString::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeCSVString<>(assert_cast<const ColumnString &>(column).getDataAt(row_num), ostr); +} + + +void SerializationString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + read(column, [&](ColumnString::Chars & data) { readCSVStringInto(data, istr, settings.csv); }); +} + + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationString.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationString.h new file mode 100644 index 00000000000..f27a5116c15 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationString.h @@ -0,0 +1,37 @@ +#pragma once + +#include <DataTypes/Serializations/ISerialization.h> + +namespace DB +{ + +class SerializationString final : public ISerialization +{ +public: + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; 
+ void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationTuple.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationTuple.cpp new file mode 100644 index 00000000000..7f3e7619b0d --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationTuple.cpp @@ -0,0 +1,484 @@ +#include <DataTypes/Serializations/SerializationTuple.h> +#include <DataTypes/Serializations/SerializationNullable.h> +#include <DataTypes/Serializations/SerializationInfoTuple.h> +#include <DataTypes/DataTypeTuple.h> +#include <Core/Field.h> +#include 
<Columns/ColumnTuple.h> +#include <Common/assert_cast.h> +#include <IO/WriteHelpers.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteBufferFromString.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH; + extern const int NOT_FOUND_COLUMN_IN_BLOCK; + extern const int INCORRECT_DATA; +} + + +static inline IColumn & extractElementColumn(IColumn & column, size_t idx) +{ + return assert_cast<ColumnTuple &>(column).getColumn(idx); +} + +static inline const IColumn & extractElementColumn(const IColumn & column, size_t idx) +{ + return assert_cast<const ColumnTuple &>(column).getColumn(idx); +} + +void SerializationTuple::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & tuple = field.get<const Tuple &>(); + for (size_t element_index = 0; element_index < elems.size(); ++element_index) + { + const auto & serialization = elems[element_index]; + serialization->serializeBinary(tuple[element_index], ostr, settings); + } +} + +void SerializationTuple::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const +{ + const size_t size = elems.size(); + + field = Tuple(); + Tuple & tuple = field.get<Tuple &>(); + tuple.reserve(size); + for (size_t i = 0; i < size; ++i) + elems[i]->deserializeBinary(tuple.emplace_back(), istr, settings); +} + +void SerializationTuple::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + for (size_t element_index = 0; element_index < elems.size(); ++element_index) + { + const auto & serialization = elems[element_index]; + serialization->serializeBinary(extractElementColumn(column, element_index), row_num, ostr, settings); + } +} + + +template <typename F> +static void addElementSafe(size_t num_elems, IColumn & column, F && impl) +{ + /// We use the assumption that tuples of zero size do not exist. 
+ size_t old_size = column.size(); + + try + { + impl(); + + // Check that all columns now have the same size. + size_t new_size = column.size(); + for (size_t i = 1; i < num_elems; ++i) + { + const auto & element_column = extractElementColumn(column, i); + if (element_column.size() != new_size) + { + // This is not a logical error because it may work with + // user-supplied data. + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH, + "Cannot read a tuple because not all elements are present"); + } + } + } + catch (...) + { + for (size_t i = 0; i < num_elems; ++i) + { + auto & element_column = extractElementColumn(column, i); + if (element_column.size() > old_size) + element_column.popBack(1); + } + + throw; + } +} + +void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + addElementSafe(elems.size(), column, [&] + { + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->deserializeBinary(extractElementColumn(column, i), istr, settings); + }); +} + +void SerializationTuple::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('(', ostr); + for (size_t i = 0; i < elems.size(); ++i) + { + if (i != 0) + writeChar(',', ostr); + elems[i]->serializeTextQuoted(extractElementColumn(column, i), row_num, ostr, settings); + } + writeChar(')', ostr); +} + +void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const +{ + const size_t size = elems.size(); + assertChar('(', istr); + + addElementSafe(elems.size(), column, [&] + { + for (size_t i = 0; i < size; ++i) + { + skipWhitespaceIfAny(istr); + if (i != 0) + { + assertChar(',', istr); + skipWhitespaceIfAny(istr); + } + elems[i]->deserializeTextQuoted(extractElementColumn(column, i), istr, settings); + } + + // Special format for one element tuple (1,) + if (1 == elems.size()) + { + 
skipWhitespaceIfAny(istr); + // Allow both (1) and (1,) + checkChar(',', istr); + } + + skipWhitespaceIfAny(istr); + assertChar(')', istr); + + if (whole && !istr.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "Tuple"); + }); +} + +void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + if (settings.json.write_named_tuples_as_objects + && have_explicit_names) + { + writeChar('{', ostr); + for (size_t i = 0; i < elems.size(); ++i) + { + if (i != 0) + { + writeChar(',', ostr); + } + writeJSONString(elems[i]->getElementName(), ostr, settings); + writeChar(':', ostr); + elems[i]->serializeTextJSON(extractElementColumn(column, i), row_num, ostr, settings); + } + writeChar('}', ostr); + } + else + { + writeChar('[', ostr); + for (size_t i = 0; i < elems.size(); ++i) + { + if (i != 0) + writeChar(',', ostr); + elems[i]->serializeTextJSON(extractElementColumn(column, i), row_num, ostr, settings); + } + writeChar(']', ostr); + } +} + +void SerializationTuple::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const +{ + if (settings.json.write_named_tuples_as_objects + && have_explicit_names) + { + writeCString("{\n", ostr); + for (size_t i = 0; i < elems.size(); ++i) + { + if (i != 0) + writeCString(",\n", ostr); + writeChar(' ', (indent + 1) * 4, ostr); + writeJSONString(elems[i]->getElementName(), ostr, settings); + writeCString(": ", ostr); + elems[i]->serializeTextJSONPretty(extractElementColumn(column, i), row_num, ostr, settings, indent + 1); + } + writeChar('\n', ostr); + writeChar(' ', indent * 4, ostr); + writeChar('}', ostr); + } + else + { + writeCString("[\n", ostr); + for (size_t i = 0; i < elems.size(); ++i) + { + if (i != 0) + writeCString(",\n", ostr); + writeChar(' ', (indent + 1) * 4, ostr); + elems[i]->serializeTextJSONPretty(extractElementColumn(column, i), 
row_num, ostr, settings, indent + 1); + } + writeChar('\n', ostr); + writeChar(' ', indent * 4, ostr); + writeChar(']', ostr); + } +} + +void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (settings.json.read_named_tuples_as_objects + && have_explicit_names) + { + skipWhitespaceIfAny(istr); + assertChar('{', istr); + skipWhitespaceIfAny(istr); + + addElementSafe(elems.size(), column, [&] + { + std::vector<UInt8> seen_elements(elems.size(), 0); + size_t processed = 0; + size_t skipped = 0; + while (!istr.eof() && *istr.position() != '}') + { + if (!settings.json.ignore_unknown_keys_in_named_tuple && processed == elems.size()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected number of elements in named tuple. Expected no more than {} (consider enabling input_format_json_ignore_unknown_keys_in_named_tuple setting)", elems.size()); + + if (processed + skipped > 0) + { + assertChar(',', istr); + skipWhitespaceIfAny(istr); + } + + std::string name; + readDoubleQuotedString(name, istr); + skipWhitespaceIfAny(istr); + assertChar(':', istr); + skipWhitespaceIfAny(istr); + + const size_t element_pos = getPositionByName(name); + if (element_pos == std::numeric_limits<size_t>::max()) + { + if (settings.json.ignore_unknown_keys_in_named_tuple) + { + skipJSONField(istr, name); + skipWhitespaceIfAny(istr); + ++skipped; + continue; + } + else + throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, "Tuple doesn't have element with name '{}', enable setting input_format_json_ignore_unknown_keys_in_named_tuple", name); + } + + seen_elements[element_pos] = 1; + auto & element_column = extractElementColumn(column, element_pos); + + try + { + if (settings.null_as_default) + SerializationNullable::deserializeTextJSONImpl(element_column, istr, settings, elems[element_pos]); + else + elems[element_pos]->deserializeTextJSON(element_column, istr, settings); + } + catch (Exception & e) + { + 
e.addMessage("(while reading the value of nested key " + name + ")"); + throw; + } + + skipWhitespaceIfAny(istr); + ++processed; + } + + assertChar('}', istr); + + /// Check if we have missing elements. + if (processed != elems.size()) + { + for (size_t element_pos = 0; element_pos != seen_elements.size(); ++element_pos) + { + if (seen_elements[element_pos]) + continue; + + if (!settings.json.defaults_for_missing_elements_in_named_tuple) + throw Exception( + ErrorCodes::INCORRECT_DATA, + "JSON object doesn't contain tuple element {}. If you want to insert defaults in case of missing elements, " + "enable setting input_format_json_defaults_for_missing_elements_in_named_tuple", + elems[element_pos]->getElementName()); + + auto & element_column = extractElementColumn(column, element_pos); + element_column.insertDefault(); + } + } + }); + } + else + { + assertChar('[', istr); + + addElementSafe(elems.size(), column, [&] + { + for (size_t i = 0; i < elems.size(); ++i) + { + skipWhitespaceIfAny(istr); + if (i != 0) + { + assertChar(',', istr); + skipWhitespaceIfAny(istr); + } + elems[i]->deserializeTextJSON(extractElementColumn(column, i), istr, settings); + } + + skipWhitespaceIfAny(istr); + assertChar(']', istr); + }); + } +} + +void SerializationTuple::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeCString("<tuple>", ostr); + for (size_t i = 0; i < elems.size(); ++i) + { + writeCString("<elem>", ostr); + elems[i]->serializeTextXML(extractElementColumn(column, i), row_num, ostr, settings); + writeCString("</elem>", ostr); + } + writeCString("</tuple>", ostr); +} + +void SerializationTuple::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + for (size_t i = 0; i < elems.size(); ++i) + { + if (i != 0) + writeChar(settings.csv.tuple_delimiter, ostr); + elems[i]->serializeTextCSV(extractElementColumn(column, i), row_num, ostr, 
settings); + } +} + +void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + addElementSafe(elems.size(), column, [&] + { + const size_t size = elems.size(); + for (size_t i = 0; i < size; ++i) + { + if (i != 0) + { + skipWhitespaceIfAny(istr); + assertChar(settings.csv.tuple_delimiter, istr); + skipWhitespaceIfAny(istr); + } + elems[i]->deserializeTextCSV(extractElementColumn(column, i), istr, settings); + } + }); +} + +void SerializationTuple::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + const auto * type_tuple = data.type ? &assert_cast<const DataTypeTuple &>(*data.type) : nullptr; + const auto * column_tuple = data.column ? &assert_cast<const ColumnTuple &>(*data.column) : nullptr; + const auto * info_tuple = data.serialization_info ? &assert_cast<const SerializationInfoTuple &>(*data.serialization_info) : nullptr; + + for (size_t i = 0; i < elems.size(); ++i) + { + auto next_data = SubstreamData(elems[i]) + .withType(type_tuple ? type_tuple->getElement(i) : nullptr) + .withColumn(column_tuple ? column_tuple->getColumnPtr(i) : nullptr) + .withSerializationInfo(info_tuple ? 
info_tuple->getElementInfo(i) : nullptr); + + elems[i]->enumerateStreams(settings, callback, next_data); + } +} + +struct SerializeBinaryBulkStateTuple : public ISerialization::SerializeBinaryBulkState +{ + std::vector<ISerialization::SerializeBinaryBulkStatePtr> states; +}; + +struct DeserializeBinaryBulkStateTuple : public ISerialization::DeserializeBinaryBulkState +{ + std::vector<ISerialization::DeserializeBinaryBulkStatePtr> states; +}; + + +void SerializationTuple::serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + auto tuple_state = std::make_shared<SerializeBinaryBulkStateTuple>(); + tuple_state->states.resize(elems.size()); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->serializeBinaryBulkStatePrefix(extractElementColumn(column, i), settings, tuple_state->states[i]); + + state = std::move(tuple_state); +} + +void SerializationTuple::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + auto * tuple_state = checkAndGetState<SerializeBinaryBulkStateTuple>(state); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->serializeBinaryBulkStateSuffix(settings, tuple_state->states[i]); +} + +void SerializationTuple::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + auto tuple_state = std::make_shared<DeserializeBinaryBulkStateTuple>(); + tuple_state->states.resize(elems.size()); + + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->deserializeBinaryBulkStatePrefix(settings, tuple_state->states[i]); + + state = std::move(tuple_state); +} + +void SerializationTuple::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + auto * tuple_state = 
checkAndGetState<SerializeBinaryBulkStateTuple>(state); + + for (size_t i = 0; i < elems.size(); ++i) + { + const auto & element_col = extractElementColumn(column, i); + elems[i]->serializeBinaryBulkWithMultipleStreams(element_col, offset, limit, settings, tuple_state->states[i]); + } +} + +void SerializationTuple::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto * tuple_state = checkAndGetState<DeserializeBinaryBulkStateTuple>(state); + + auto mutable_column = column->assumeMutable(); + auto & column_tuple = assert_cast<ColumnTuple &>(*mutable_column); + + settings.avg_value_size_hint = 0; + for (size_t i = 0; i < elems.size(); ++i) + elems[i]->deserializeBinaryBulkWithMultipleStreams(column_tuple.getColumnPtr(i), limit, settings, tuple_state->states[i], cache); +} + +size_t SerializationTuple::getPositionByName(const String & name) const +{ + size_t size = elems.size(); + for (size_t i = 0; i < size; ++i) + if (elems[i]->getElementName() == name) + return i; + return std::numeric_limits<size_t>::max(); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationTuple.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationTuple.h new file mode 100644 index 00000000000..7325259f440 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationTuple.h @@ -0,0 +1,78 @@ +#pragma once + +#include <DataTypes/Serializations/SimpleTextSerialization.h> +#include <DataTypes/Serializations/SerializationNamed.h> + +namespace DB +{ + +class SerializationTuple final : public SimpleTextSerialization +{ +public: + using ElementSerializationPtr = std::shared_ptr<const SerializationNamed>; + using ElementSerializations = std::vector<ElementSerializationPtr>; + + SerializationTuple(const ElementSerializations & elems_, bool have_explicit_names_) + : elems(elems_), 
have_explicit_names(have_explicit_names_) + { + } + + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + + /// Tuples in CSV format will be serialized as separate columns (that is, losing their nesting in the tuple). + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + /** Each sub-column in a tuple is serialized in separate stream. 
+ */ + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + const ElementSerializations & getElementsSerializations() const { return elems; } + +private: + ElementSerializations elems; + bool have_explicit_names; + + size_t getPositionByName(const String & name) const; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationUUID.cpp b/contrib/clickhouse/src/DataTypes/Serializations/SerializationUUID.cpp new file mode 100644 index 00000000000..613a16541f5 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationUUID.cpp @@ -0,0 +1,173 @@ +#include <Columns/ColumnsNumber.h> +#include <DataTypes/Serializations/SerializationUUID.h> +#include <Formats/ProtobufReader.h> +#include <Formats/ProtobufWriter.h> +#include <IO/ReadBufferFromString.h> +#include <IO/ReadHelpers.h> +#include <IO/WriteHelpers.h> +#include <Common/assert_cast.h> + +#include <ranges> + +namespace DB +{ + +void SerializationUUID::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const 
FormatSettings &) const +{ + writeText(assert_cast<const ColumnUUID &>(column).getData()[row_num], ostr); +} + +void SerializationUUID::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const +{ + UUID x; + readText(x, istr); + assert_cast<ColumnUUID &>(column).getData().push_back(x); + + if (whole && !istr.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "UUID"); +} + +void SerializationUUID::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeText(column, istr, settings, false); +} + +void SerializationUUID::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeText(column, row_num, ostr, settings); +} + +void SerializationUUID::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); +} + +void SerializationUUID::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID uuid; + bool fast = false; + if (istr.available() >= 38) + { + assertChar('\'', istr); + char * next_pos = find_first_symbols<'\\', '\''>(istr.position(), istr.buffer().end()); + const size_t len = next_pos - istr.position(); + if ((len == 32 || len == 36) && istr.position()[len] == '\'') + { + uuid = parseUUID(std::span(reinterpret_cast<const UInt8 *>(istr.position()), len)); + istr.ignore(len + 1); + fast = true; + } + else + { + // It's ok to go back in the position because we haven't read from the buffer except the first char + // and we know there were at least 38 bytes available (so no new read has been triggered) + istr.position()--; + } + } + + if (!fast) + { + String quoted_chars; + readQuotedStringInto<false>(quoted_chars, istr); + ReadBufferFromString 
parsed_quoted_buffer(quoted_chars); + readText(uuid, parsed_quoted_buffer); + } + + assert_cast<ColumnUUID &>(column).getData().push_back(std::move(uuid)); /// It's important to do this at the end - for exception safety. +} + +void SerializationUUID::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationUUID::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID x; + assertChar('"', istr); + readText(x, istr); + assertChar('"', istr); + assert_cast<ColumnUUID &>(column).getData().push_back(x); +} + +void SerializationUUID::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +void SerializationUUID::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID value; + readCSV(value, istr); + assert_cast<ColumnUUID &>(column).getData().push_back(value); +} + + +void SerializationUUID::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const +{ + UUID x = field.get<UUID>(); + writeBinaryLittleEndian(x, ostr); +} + +void SerializationUUID::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const +{ + UUID x; + readBinaryLittleEndian(x, istr); + field = NearestFieldType<UUID>(x); +} + +void SerializationUUID::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +{ + writeBinaryLittleEndian(assert_cast<const ColumnVector<UUID> &>(column).getData()[row_num], ostr); +} + +void SerializationUUID::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID x; + readBinaryLittleEndian(x, istr); + 
assert_cast<ColumnVector<UUID> &>(column).getData().push_back(x); +} + +void SerializationUUID::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const typename ColumnVector<UUID>::Container & x = typeid_cast<const ColumnVector<UUID> &>(column).getData(); + if (const size_t size = x.size(); limit == 0 || offset + limit > size) + limit = size - offset; + + if (limit == 0) + return; + + if constexpr (std::endian::native == std::endian::big) + { +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunreachable-code" + for (size_t i = offset; i < offset + limit; ++i) + writeBinaryLittleEndian(x[i], ostr); +#pragma clang diagnostic pop + } + else + ostr.write(reinterpret_cast<const char *>(&x[offset]), sizeof(UUID) * limit); +} + +void SerializationUUID::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const +{ + typename ColumnVector<UUID>::Container & x = typeid_cast<ColumnVector<UUID> &>(column).getData(); + const size_t initial_size = x.size(); + x.resize(initial_size + limit); + const size_t size = istr.readBig(reinterpret_cast<char *>(&x[initial_size]), sizeof(UUID) * limit); + x.resize(initial_size + size / sizeof(UUID)); + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunreachable-code" + if constexpr (std::endian::native == std::endian::big) + for (size_t i = initial_size; i < x.size(); ++i) + transformEndianness<std::endian::big, std::endian::little>(x[i]); +#pragma clang diagnostic pop +} +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationUUID.h b/contrib/clickhouse/src/DataTypes/Serializations/SerializationUUID.h new file mode 100644 index 00000000000..da8c15f7279 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationUUID.h @@ -0,0 +1,30 @@ +#pragma once + +#include <DataTypes/Serializations/SerializationNumber.h> + +namespace DB +{ + +class SerializationUUID : 
public SimpleTextSerialization +{ +public: + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationWrapper.cpp 
b/contrib/clickhouse/src/DataTypes/Serializations/SerializationWrapper.cpp new file mode 100644 index 00000000000..18e4891ee65 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationWrapper.cpp @@ -0,0 +1,149 @@ +#include <DataTypes/Serializations/SerializationWrapper.h> +#include <Columns/IColumn.h> + +namespace DB +{ + +void SerializationWrapper::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + nested_serialization->enumerateStreams(settings, callback, data); +} + +void SerializationWrapper::serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + nested_serialization->serializeBinaryBulkStatePrefix(column, settings, state); +} + +void SerializationWrapper::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + nested_serialization->serializeBinaryBulkStateSuffix(settings, state); +} + +void SerializationWrapper::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + nested_serialization->deserializeBinaryBulkStatePrefix(settings, state); +} + +void SerializationWrapper::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + + nested_serialization->serializeBinaryBulkWithMultipleStreams(column, offset, limit, settings, state); +} + +void SerializationWrapper::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + + nested_serialization->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, state, cache); +} + +void 
SerializationWrapper::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const +{ + nested_serialization->serializeBinaryBulk(column, ostr, offset, limit); +} + +void SerializationWrapper::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const +{ + nested_serialization->deserializeBinaryBulk(column, istr, limit, avg_value_size_hint); +} + +void SerializationWrapper::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested_serialization->serializeBinary(field, ostr, settings); +} + +void SerializationWrapper::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const +{ + nested_serialization->deserializeBinary(field, istr, settings); +} + +void SerializationWrapper::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested_serialization->serializeBinary(column, row_num, ostr, settings); +} + +void SerializationWrapper::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + nested_serialization->deserializeBinary(column, istr, settings); +} + +void SerializationWrapper::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested_serialization->serializeTextEscaped(column, row_num, ostr, settings); +} + +void SerializationWrapper::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + nested_serialization->deserializeTextEscaped(column, istr, settings); +} + +void SerializationWrapper::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested_serialization->serializeTextQuoted(column, row_num, ostr, settings); +} + +void SerializationWrapper::deserializeTextQuoted(IColumn & column, ReadBuffer 
& istr, const FormatSettings & settings) const +{ + nested_serialization->deserializeTextQuoted(column, istr, settings); +} + +void SerializationWrapper::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested_serialization->serializeTextCSV(column, row_num, ostr, settings); +} + +void SerializationWrapper::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + nested_serialization->deserializeTextCSV(column, istr, settings); +} + +void SerializationWrapper::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested_serialization->serializeText(column, row_num, ostr, settings); +} + +void SerializationWrapper::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + nested_serialization->deserializeWholeText(column, istr, settings); +} + +void SerializationWrapper::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested_serialization->serializeTextJSON(column, row_num, ostr, settings); +} + +void SerializationWrapper::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + nested_serialization->deserializeTextJSON(column, istr, settings); +} + +void SerializationWrapper::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const +{ + nested_serialization->serializeTextJSONPretty(column, row_num, ostr, settings, indent); +} + +void SerializationWrapper::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + nested_serialization->serializeTextXML(column, row_num, ostr, settings); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SerializationWrapper.h 
b/contrib/clickhouse/src/DataTypes/Serializations/SerializationWrapper.h new file mode 100644 index 00000000000..31900f93148 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SerializationWrapper.h @@ -0,0 +1,83 @@ +#pragma once + +#include <DataTypes/Serializations/ISerialization.h> +#include <Common/Exception.h> + +namespace DB +{ + +/// Wrapper for serialization, which calls methods, which are not overridden, from nested serialization. +/// You can inherit this class, when you need to override bunch of methods, to avoid boilerplate code. +class SerializationWrapper : public ISerialization +{ +protected: + SerializationPtr nested_serialization; + +public: + explicit SerializationWrapper(const SerializationPtr & nested_serialization_) : nested_serialization(nested_serialization_) {} + + const SerializationPtr & getNested() const { return nested_serialization; } + + Kind getKind() const override { return nested_serialization->getKind(); } + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void 
serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override; + void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) 
const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SimpleTextSerialization.h b/contrib/clickhouse/src/DataTypes/Serializations/SimpleTextSerialization.h new file mode 100644 index 00000000000..0247f30b30a --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SimpleTextSerialization.h @@ -0,0 +1,64 @@ +#pragma once +#include <DataTypes/Serializations/ISerialization.h> + +namespace DB +{ + +/// Helper class to define same ISerialization text (de)serialization for all the variants (escaped, quoted, JSON, CSV). +/// You need to define serializeText() and deserializeText() in derived class. +class SimpleTextSerialization : public ISerialization +{ +protected: + SimpleTextSerialization() = default; + + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override + { + serializeText(column, row_num, ostr, settings); + } + + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override + { + serializeText(column, row_num, ostr, settings); + } + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override + { + serializeText(column, row_num, ostr, settings); + } + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override + { + serializeText(column, row_num, ostr, settings); + } + + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + deserializeText(column, istr, settings, true); + } + + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + deserializeText(column, istr, settings, 
false); + } + + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + deserializeText(column, istr, settings, false); + } + + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + deserializeText(column, istr, settings, false); + } + + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + deserializeText(column, istr, settings, false); + } + + /// whole = true means that buffer contains only one value, so we should read until EOF. + /// It's needed to check if there is garbage after parsed field. + virtual void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const = 0; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/Serializations/SubcolumnsTree.h b/contrib/clickhouse/src/DataTypes/Serializations/SubcolumnsTree.h new file mode 100644 index 00000000000..fda45e1e9a2 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/Serializations/SubcolumnsTree.h @@ -0,0 +1,209 @@ +#pragma once + +#include <DataTypes/Serializations/PathInData.h> +#include <DataTypes/IDataType.h> +#include <Columns/IColumn.h> +#include <Common/HashTable/HashMap.h> + +namespace DB +{ + +/// Tree that represents paths in document +/// with additional data in nodes. 
+template <typename NodeData> +class SubcolumnsTree +{ +public: + struct Node + { + enum Kind + { + TUPLE, + NESTED, + SCALAR, + }; + + explicit Node(Kind kind_) : kind(kind_) {} + Node(Kind kind_, const NodeData & data_) : kind(kind_), data(data_) {} + Node(Kind kind_, const NodeData & data_, const PathInData & path_) + : kind(kind_), data(data_), path(path_) {} + + Kind kind = TUPLE; + const Node * parent = nullptr; + + Arena strings_pool; + HashMapWithStackMemory<StringRef, std::shared_ptr<Node>, StringRefHash, 4> children; + + NodeData data; + PathInData path; + + bool isNested() const { return kind == NESTED; } + bool isScalar() const { return kind == SCALAR; } + + void addChild(std::string_view key, std::shared_ptr<Node> next_node) + { + next_node->parent = this; + StringRef key_ref{strings_pool.insert(key.data(), key.length()), key.length()}; + children[key_ref] = std::move(next_node); + } + }; + + using NodeKind = typename Node::Kind; + using NodePtr = std::shared_ptr<Node>; + + SubcolumnsTree() : root(std::make_shared<Node>(Node::TUPLE)) {} + + /// Add a leaf without any data in other nodes. + bool add(const PathInData & path, const NodeData & leaf_data) + { + return add(path, [&](NodeKind kind, bool exists) -> NodePtr + { + if (exists) + return nullptr; + + if (kind == Node::SCALAR) + return std::make_shared<Node>(kind, leaf_data, path); + + return std::make_shared<Node>(kind); + }); + } + + /// Callback for creation of node. Receives kind of node and + /// flag, which is true if node already exists. 
+ using NodeCreator = std::function<NodePtr(NodeKind, bool)>; + /// Insert a leaf at `path`, creating missing intermediate nodes via `node_creator`; returns true only if a new leaf was added. + bool add(const PathInData & path, const NodeCreator & node_creator) + { + const auto & parts = path.getParts(); + if (parts.empty()) /// nothing to insert for a path without parts + return false; + + Node * current_node = root.get(); + for (size_t i = 0; i < parts.size() - 1; ++i) /// every part except the last one addresses an intermediate node + { + assert(current_node->kind != Node::SCALAR); + + auto it = current_node->children.find(StringRef{parts[i].key}); + if (it != current_node->children.end()) + { + current_node = it->getMapped().get(); + node_creator(current_node->kind, true); /// existing node: creator is invoked with exists = true, its result is discarded + + if (current_node->isNested() != parts[i].is_nested) /// kind of the existing node disagrees with the path part + return false; + } + else + { + auto next_kind = parts[i].is_nested ? Node::NESTED : Node::TUPLE; + auto next_node = node_creator(next_kind, false); /// NOTE(review): used below without a null check - confirm creators never return nullptr here + current_node->addChild(String(parts[i].key), next_node); + current_node = next_node.get(); + } + } + + auto it = current_node->children.find(StringRef{parts.back().key}); + if (it != current_node->children.end()) /// a child with the last key already exists: it is not replaced + return false; + + auto next_node = node_creator(Node::SCALAR, false); + current_node->addChild(String(parts.back().key), next_node); + leaves.push_back(std::move(next_node)); /// new leaves are also tracked in `leaves` + + return true; + } + + /// Find node that matches the path the best. + const Node * findBestMatch(const PathInData & path) const + { + return findImpl(path, false); + } + + /// Find node that matches the path exactly. + const Node * findExact(const PathInData & path) const + { + return findImpl(path, true); + } + + /// Find leaf by path. Returns nullptr if there is no exact match or the matched node is not scalar. + const Node * findLeaf(const PathInData & path) const + { + const auto * candidate = findExact(path); + if (!candidate || !candidate->isScalar()) + return nullptr; + return candidate; + } + + using NodePredicate = std::function<bool(const Node &)>; + + /// Finds leaf that satisfies the predicate. 
+ const Node * findLeaf(const NodePredicate & predicate) + { + return findLeaf(root.get(), predicate); + } + + static const Node * findLeaf(const Node * node, const NodePredicate & predicate) + { + if (!node) + return nullptr; + + if (node->isScalar()) + return predicate(*node) ? node : nullptr; + + for (const auto & [_, child] : node->children) + if (const auto * leaf = findLeaf(child.get(), predicate)) + return leaf; + + return nullptr; + } + + /// Find first parent node that satisfies the predicate. + static const Node * findParent(const Node * node, const NodePredicate & predicate) + { + while (node && !predicate(*node)) + node = node->parent; + return node; + } + + bool empty() const { return root->children.empty(); } + size_t size() const { return leaves.size(); } + + using Nodes = std::vector<NodePtr>; + + const Nodes & getLeaves() const { return leaves; } + const Node & getRoot() const { return *root; } + + using iterator = typename Nodes::iterator; + using const_iterator = typename Nodes::const_iterator; + + iterator begin() { return leaves.begin(); } + iterator end() { return leaves.end(); } + + const_iterator begin() const { return leaves.begin(); } + const_iterator end() const { return leaves.end(); } + +private: + const Node * findImpl(const PathInData & path, bool find_exact) const + { + if (empty()) + return nullptr; + + const auto & parts = path.getParts(); + const auto * current_node = root.get(); + + for (const auto & part : parts) + { + auto it = current_node->children.find(StringRef{part.key}); + if (it == current_node->children.end()) + return find_exact ? 
nullptr : current_node; + + current_node = it->getMapped().get(); + } + + return current_node; + } + + NodePtr root; + Nodes leaves; +}; + +} diff --git a/contrib/clickhouse/src/DataTypes/TimezoneMixin.h b/contrib/clickhouse/src/DataTypes/TimezoneMixin.h new file mode 100644 index 00000000000..03ecde5dd0a --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/TimezoneMixin.h @@ -0,0 +1,32 @@ +#pragma once +#include <Core/Types.h> +#include <Common/DateLUT.h> + +class DateLUTImpl; + +/** Mixin-class that manages timezone info for timezone-aware DateTime implementations + * + * Must be used as a (second) base for class implementing IDataType/ISerialization-interface. + */ +class TimezoneMixin +{ +public: + TimezoneMixin(const TimezoneMixin &) = default; + + explicit TimezoneMixin(const String & time_zone_name = "") + : has_explicit_time_zone(!time_zone_name.empty()) + , time_zone(DateLUT::instance(time_zone_name)) + , utc_time_zone(DateLUT::instance("UTC")) + { + } + + const DateLUTImpl & getTimeZone() const { return time_zone; } + bool hasExplicitTimeZone() const { return has_explicit_time_zone; } + +protected: + /// true if time zone name was provided in data type parameters, false if it's using default time zone. 
+ bool has_explicit_time_zone; + + const DateLUTImpl & time_zone; + const DateLUTImpl & utc_time_zone; +}; diff --git a/contrib/clickhouse/src/DataTypes/convertMySQLDataType.cpp b/contrib/clickhouse/src/DataTypes/convertMySQLDataType.cpp new file mode 100644 index 00000000000..bb848bf1526 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/convertMySQLDataType.cpp @@ -0,0 +1,132 @@ +#include "convertMySQLDataType.h" + +#include <Core/Field.h> +#include <base/types.h> +#include <Core/MultiEnum.h> +#include <Core/SettingsEnums.h> +#include <Parsers/ASTFunction.h> +#include <Parsers/IAST.h> +#include "DataTypeDate.h" +#include "DataTypeDate32.h" +#include "DataTypeDateTime.h" +#include "DataTypeDateTime64.h" +#include "DataTypeEnum.h" +#include "DataTypesDecimal.h" +#include "DataTypeFixedString.h" +#include "DataTypeNullable.h" +#include "DataTypeString.h" +#include "DataTypesNumber.h" +#include "IDataType.h" + +namespace DB +{ + +DataTypePtr convertMySQLDataType(MultiEnum<MySQLDataTypesSupport> type_support, + const std::string & mysql_data_type, + bool is_nullable, + bool is_unsigned, + size_t length, + size_t precision, + size_t scale) +{ + // Mysql returns mysql_data_type as below: + // 1. basic_type + // 2. basic_type options + // 3. type_with_params(param1, param2, ...) + // 4. type_with_params(param1, param2, ...) options + // The options can be unsigned, zerofill, or some other strings. 
+ auto data_type = std::string_view(mysql_data_type); + const auto type_end_pos = data_type.find_first_of(R"(( )"); // FIXME: fix style-check script instead + const auto type_name = data_type.substr(0, type_end_pos); + + DataTypePtr res; + + if (type_name == "tinyint") + { + if (is_unsigned) + res = std::make_shared<DataTypeUInt8>(); + else + res = std::make_shared<DataTypeInt8>(); + } + else if (type_name == "smallint") + { + if (is_unsigned) + res = std::make_shared<DataTypeUInt16>(); + else + res = std::make_shared<DataTypeInt16>(); + } + else if (type_name == "int" || type_name == "mediumint" || type_name == "integer") + { + if (is_unsigned) + res = std::make_shared<DataTypeUInt32>(); + else + res = std::make_shared<DataTypeInt32>(); + } + else if (type_name == "bigint") + { + if (is_unsigned) + res = std::make_shared<DataTypeUInt64>(); + else + res = std::make_shared<DataTypeInt64>(); + } + else if (type_name == "float") + res = std::make_shared<DataTypeFloat32>(); + else if (type_name == "double") + res = std::make_shared<DataTypeFloat64>(); + else if (type_name == "date") + { + if (type_support.isSet(MySQLDataTypesSupport::DATE2DATE32)) + res = std::make_shared<DataTypeDate32>(); + else if (type_support.isSet(MySQLDataTypesSupport::DATE2STRING)) + res = std::make_shared<DataTypeString>(); + else + res = std::make_shared<DataTypeDate>(); + } + else if (type_name == "binary") + { + //compatible with binary(0) DataType + if (length == 0) length = 1; + res = std::make_shared<DataTypeFixedString>(length); + } + else if (type_name == "datetime" || type_name == "timestamp") + { + if (!type_support.isSet(MySQLDataTypesSupport::DATETIME64)) + { + res = std::make_shared<DataTypeDateTime>(); + } + else if (type_name == "timestamp" && scale == 0) + { + res = std::make_shared<DataTypeDateTime>(); + } + else if (type_name == "datetime" || type_name == "timestamp") + { + res = std::make_shared<DataTypeDateTime64>(scale); + } + } + else if (type_name == "bit") + { + res = 
std::make_shared<DataTypeUInt64>(); + } + else if (type_support.isSet(MySQLDataTypesSupport::DECIMAL) && (type_name == "numeric" || type_name == "decimal")) + { + if (precision <= DecimalUtils::max_precision<Decimal32>) + res = std::make_shared<DataTypeDecimal<Decimal32>>(precision, scale); + else if (precision <= DecimalUtils::max_precision<Decimal64>) + res = std::make_shared<DataTypeDecimal<Decimal64>>(precision, scale); + else if (precision <= DecimalUtils::max_precision<Decimal128>) + res = std::make_shared<DataTypeDecimal<Decimal128>>(precision, scale); + else if (precision <= DecimalUtils::max_precision<Decimal256>) + res = std::make_shared<DataTypeDecimal<Decimal256>>(precision, scale); + } + + /// Also String is fallback for all unknown types. + if (!res) + res = std::make_shared<DataTypeString>(); + + if (is_nullable) + res = std::make_shared<DataTypeNullable>(res); + + return res; +} + +} diff --git a/contrib/clickhouse/src/DataTypes/convertMySQLDataType.h b/contrib/clickhouse/src/DataTypes/convertMySQLDataType.h new file mode 100644 index 00000000000..543119bc60e --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/convertMySQLDataType.h @@ -0,0 +1,15 @@ +#pragma once + +#include <string> +#include <Core/MultiEnum.h> +#include <Parsers/IAST.h> +#include "IDataType.h" + +namespace DB +{ +enum class MySQLDataTypesSupport; + +/// Convert MySQL type to ClickHouse data type. 
+DataTypePtr convertMySQLDataType(MultiEnum<MySQLDataTypesSupport> type_support, const std::string & mysql_data_type, bool is_nullable, bool is_unsigned, size_t length, size_t precision, size_t scale); + +} diff --git a/contrib/clickhouse/src/DataTypes/getLeastSupertype.cpp b/contrib/clickhouse/src/DataTypes/getLeastSupertype.cpp new file mode 100644 index 00000000000..9d42d82ce91 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/getLeastSupertype.cpp @@ -0,0 +1,668 @@ +#include <unordered_set> + +#include <IO/WriteBufferFromString.h> +#include <IO/Operators.h> +#include <Common/typeid_cast.h> + +#include <DataTypes/getLeastSupertype.h> + +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/DataTypeMap.h> +#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/DataTypeLowCardinality.h> +#include <DataTypes/DataTypeNothing.h> +#include <DataTypes/DataTypeString.h> +#include <DataTypes/DataTypeDateTime.h> +#include <DataTypes/DataTypeDateTime64.h> +#include <DataTypes/DataTypesNumber.h> +#include <DataTypes/DataTypesDecimal.h> +#include <DataTypes/DataTypeFactory.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NO_COMMON_TYPE; +} + +namespace +{ + +String typeToString(const DataTypePtr & type) { return type->getName(); } +String typeToString(const TypeIndex & type) { return String(magic_enum::enum_name(type)); } + +template <typename DataTypes> +String getExceptionMessagePrefix(const DataTypes & types) +{ + WriteBufferFromOwnString res; + + bool first = true; + for (const auto & type : types) + { + if (!first) + res << ", "; + first = false; + + res << typeToString(type); + } + + return res.str(); +} + +template <LeastSupertypeOnError on_error, typename DataTypes> +DataTypePtr throwOrReturn(const DataTypes & types, std::string_view message_suffix, int error_code) +{ + if constexpr (on_error == LeastSupertypeOnError::String) + return std::make_shared<DataTypeString>(); + + if constexpr (on_error 
== LeastSupertypeOnError::Null) + return nullptr; + + if (message_suffix.empty()) + throw Exception(error_code, "There is no supertype for types {}", getExceptionMessagePrefix(types)); + + throw Exception(error_code, "There is no supertype for types {} {}", getExceptionMessagePrefix(types), message_suffix); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr getNumericType(const TypeIndexSet & types) +{ + bool all_numbers = true; + + size_t max_bits_of_signed_integer = 0; + size_t max_bits_of_unsigned_integer = 0; + size_t max_mantissa_bits_of_floating = 0; + + auto maximize = [](size_t & what, size_t value) + { + if (value > what) + what = value; + }; + + for (const auto & type : types) + { + if (type == TypeIndex::UInt8) + maximize(max_bits_of_unsigned_integer, 8); + else if (type == TypeIndex::UInt16) + maximize(max_bits_of_unsigned_integer, 16); + else if (type == TypeIndex::UInt32 || type == TypeIndex::IPv4) + maximize(max_bits_of_unsigned_integer, 32); + else if (type == TypeIndex::UInt64) + maximize(max_bits_of_unsigned_integer, 64); + else if (type == TypeIndex::UInt128) + maximize(max_bits_of_unsigned_integer, 128); + else if (type == TypeIndex::UInt256) + maximize(max_bits_of_unsigned_integer, 256); + else if (type == TypeIndex::Int8 || type == TypeIndex::Enum8) + maximize(max_bits_of_signed_integer, 8); + else if (type == TypeIndex::Int16 || type == TypeIndex::Enum16) + maximize(max_bits_of_signed_integer, 16); + else if (type == TypeIndex::Int32) + maximize(max_bits_of_signed_integer, 32); + else if (type == TypeIndex::Int64) + maximize(max_bits_of_signed_integer, 64); + else if (type == TypeIndex::Int128) + maximize(max_bits_of_signed_integer, 128); + else if (type == TypeIndex::Int256) + maximize(max_bits_of_signed_integer, 256); + else if (type == TypeIndex::Float32) + maximize(max_mantissa_bits_of_floating, 24); + else if (type == TypeIndex::Float64) + maximize(max_mantissa_bits_of_floating, 53); + else if (type != TypeIndex::Nothing) + 
all_numbers = false; + } + + if (max_bits_of_signed_integer || max_bits_of_unsigned_integer || max_mantissa_bits_of_floating) + { + if (!all_numbers) + return throwOrReturn<on_error>(types, "because some of them are numbers and some of them are not", ErrorCodes::NO_COMMON_TYPE); + + /// If there are signed and unsigned types of same bit-width, the result must be signed number with at least one more bit. + /// Example, common of Int32, UInt32 = Int64. + + size_t min_bit_width_of_integer = std::max(max_bits_of_signed_integer, max_bits_of_unsigned_integer); + + /// If unsigned is not covered by signed. + if (max_bits_of_signed_integer && max_bits_of_unsigned_integer >= max_bits_of_signed_integer) + { + // Because 128 and 256 bit integers are significantly slower, we should not promote to them. + // But if we already have wide numbers, promotion is necessary. + if (min_bit_width_of_integer != 64) + ++min_bit_width_of_integer; + else + return throwOrReturn<on_error>(types, + "because some of them are signed integers and some are unsigned integers," + " but there is no signed integer type, that can exactly represent all required unsigned integer values", + ErrorCodes::NO_COMMON_TYPE); + } + + /// If the result must be floating. + if (max_mantissa_bits_of_floating) + { + size_t min_mantissa_bits = std::max(min_bit_width_of_integer, max_mantissa_bits_of_floating); + if (min_mantissa_bits <= 24) + return std::make_shared<DataTypeFloat32>(); + else if (min_mantissa_bits <= 53) + return std::make_shared<DataTypeFloat64>(); + else + return throwOrReturn<on_error>(types, + " because some of them are integers and some are floating point," + " but there is no floating point type, that can exactly represent all required integers", ErrorCodes::NO_COMMON_TYPE); + } + + /// If the result must be signed integer. 
+ if (max_bits_of_signed_integer) + { + if (min_bit_width_of_integer <= 8) + return std::make_shared<DataTypeInt8>(); + else if (min_bit_width_of_integer <= 16) + return std::make_shared<DataTypeInt16>(); + else if (min_bit_width_of_integer <= 32) + return std::make_shared<DataTypeInt32>(); + else if (min_bit_width_of_integer <= 64) + return std::make_shared<DataTypeInt64>(); + else if (min_bit_width_of_integer <= 128) + return std::make_shared<DataTypeInt128>(); + else if (min_bit_width_of_integer <= 256) + return std::make_shared<DataTypeInt256>(); + else + return throwOrReturn<on_error>(types, + " because some of them are signed integers and some are unsigned integers," + " but there is no signed integer type, that can exactly represent all required unsigned integer values", ErrorCodes::NO_COMMON_TYPE); + } + + /// All unsigned. + { + if (min_bit_width_of_integer <= 8) + return std::make_shared<DataTypeUInt8>(); + else if (min_bit_width_of_integer <= 16) + return std::make_shared<DataTypeUInt16>(); + else if (min_bit_width_of_integer <= 32) + return std::make_shared<DataTypeUInt32>(); + else if (min_bit_width_of_integer <= 64) + return std::make_shared<DataTypeUInt64>(); + else if (min_bit_width_of_integer <= 128) + return std::make_shared<DataTypeUInt128>(); + else if (min_bit_width_of_integer <= 256) + return std::make_shared<DataTypeUInt256>(); + else + return throwOrReturn<on_error>(types, + " but as all data types are unsigned integers, we must have found maximum unsigned integer type", ErrorCodes::NO_COMMON_TYPE); + } + } + + return {}; +} + +} + +template <LeastSupertypeOnError on_error> +DataTypePtr getLeastSupertype(const DataTypes & types) +{ + /// Trivial cases + + if (types.empty()) + return std::make_shared<DataTypeNothing>(); + + if (types.size() == 1) + return types[0]; + + /// All types are equal + { + bool all_equal = true; + for (size_t i = 1, size = types.size(); i < size; ++i) + { + if (!types[i]->equals(*types[0])) + { + all_equal = false; + 
break; + } + } + + if (all_equal) + return types[0]; + } + + /// Recursive rules + + /// If there are Nothing types, skip them + { + DataTypes non_nothing_types; + non_nothing_types.reserve(types.size()); + + for (const auto & type : types) + if (!typeid_cast<const DataTypeNothing *>(type.get())) + non_nothing_types.emplace_back(type); + + if (non_nothing_types.size() < types.size()) + return getLeastSupertype<on_error>(non_nothing_types); + } + + /// For Arrays + { + bool have_array = false; + bool all_arrays = true; + + DataTypes nested_types; + nested_types.reserve(types.size()); + + for (const auto & type : types) + { + if (const DataTypeArray * type_array = typeid_cast<const DataTypeArray *>(type.get())) + { + have_array = true; + nested_types.emplace_back(type_array->getNestedType()); + } + else + all_arrays = false; + } + + if (have_array) + { + if (!all_arrays) + return throwOrReturn<on_error>(types, "because some of them are Array and some of them are not", ErrorCodes::NO_COMMON_TYPE); + + auto nested_type = getLeastSupertype<on_error>(nested_types); + /// When on_error == LeastSupertypeOnError::Null and we cannot get least supertype, + /// nested_type will be nullptr, we should return nullptr in this case. 
+ if (!nested_type) + return nullptr; + + return std::make_shared<DataTypeArray>(nested_type); + } + } + + /// For tuples + { + bool have_tuple = false; + bool all_tuples = true; + size_t tuple_size = 0; + + std::vector<DataTypes> nested_types; + + for (const auto & type : types) + { + if (const DataTypeTuple * type_tuple = typeid_cast<const DataTypeTuple *>(type.get())) + { + if (!have_tuple) + { + tuple_size = type_tuple->getElements().size(); + nested_types.resize(tuple_size); + for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) + nested_types[elem_idx].reserve(types.size()); + } + else if (tuple_size != type_tuple->getElements().size()) + return throwOrReturn<on_error>(types, "because Tuples have different sizes", ErrorCodes::NO_COMMON_TYPE); + + have_tuple = true; + + for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) + nested_types[elem_idx].emplace_back(type_tuple->getElements()[elem_idx]); + } + else + all_tuples = false; + } + + if (have_tuple) + { + if (!all_tuples) + return throwOrReturn<on_error>(types, "because some of them are Tuple and some of them are not", ErrorCodes::NO_COMMON_TYPE); + + DataTypes common_tuple_types(tuple_size); + for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) + { + auto common_type = getLeastSupertype<on_error>(nested_types[elem_idx]); + /// When on_error == LeastSupertypeOnError::Null and we cannot get least supertype, + /// common_type will be nullptr, we should return nullptr in this case. 
+ if (!common_type) + return nullptr; + common_tuple_types[elem_idx] = common_type; + } + + return std::make_shared<DataTypeTuple>(common_tuple_types); + } + } + + /// For maps + { + bool have_maps = false; + bool all_maps = true; + DataTypes key_types; + DataTypes value_types; + key_types.reserve(types.size()); + value_types.reserve(types.size()); + + for (const auto & type : types) + { + if (const DataTypeMap * type_map = typeid_cast<const DataTypeMap *>(type.get())) + { + have_maps = true; + key_types.emplace_back(type_map->getKeyType()); + value_types.emplace_back(type_map->getValueType()); + } + else + all_maps = false; + } + + if (have_maps) + { + if (!all_maps) + return throwOrReturn<on_error>(types, "because some of them are Maps and some of them are not", ErrorCodes::NO_COMMON_TYPE); + + auto keys_common_type = getLeastSupertype<on_error>(key_types); + auto values_common_type = getLeastSupertype<on_error>(value_types); + /// When on_error == LeastSupertypeOnError::Null and we cannot get least supertype for keys or values, + /// keys_common_type or values_common_type will be nullptr, we should return nullptr in this case. + if (!keys_common_type || !values_common_type) + return nullptr; + + return std::make_shared<DataTypeMap>(keys_common_type, values_common_type); + } + } + + /// For LowCardinality. This is above Nullable, because LowCardinality can contain Nullable but cannot be inside Nullable. + { + bool have_low_cardinality = false; + bool have_not_low_cardinality = false; + + DataTypes nested_types; + nested_types.reserve(types.size()); + + for (const auto & type : types) + { + if (const DataTypeLowCardinality * type_low_cardinality = typeid_cast<const DataTypeLowCardinality *>(type.get())) + { + have_low_cardinality = true; + nested_types.emplace_back(type_low_cardinality->getDictionaryType()); + } + else + { + have_not_low_cardinality = true; + nested_types.emplace_back(type); + } + } + + /// All LowCardinality gives LowCardinality. 
+ /// LowCardinality with high cardinality gives high cardinality. + if (have_low_cardinality) + { + if (have_not_low_cardinality) + return getLeastSupertype<on_error>(nested_types); + else + { + auto nested_type = getLeastSupertype<on_error>(nested_types); + /// When on_error == LeastSupertypeOnError::Null and we cannot get least supertype, + /// nested_type will be nullptr, we should return nullptr in this case. + if (!nested_type) + return nullptr; + return std::make_shared<DataTypeLowCardinality>(nested_type); + } + } + } + + /// For Nullable + { + bool have_nullable = false; + + DataTypes nested_types; + nested_types.reserve(types.size()); + + for (const auto & type : types) + { + if (const DataTypeNullable * type_nullable = typeid_cast<const DataTypeNullable *>(type.get())) + { + have_nullable = true; + + if (!type_nullable->onlyNull()) + nested_types.emplace_back(type_nullable->getNestedType()); + } + else + nested_types.emplace_back(type); + } + + if (have_nullable) + { + auto nested_type = getLeastSupertype<on_error>(nested_types); + /// When on_error == LeastSupertypeOnError::Null and we cannot get least supertype, + /// nested_type will be nullptr, we should return nullptr in this case. + if (!nested_type) + return nullptr; + return std::make_shared<DataTypeNullable>(nested_type); + } + } + + /// Non-recursive rules + + TypeIndexSet type_ids; + for (const auto & type : types) + type_ids.insert(type->getTypeId()); + + /// For String and FixedString, or for different FixedStrings, the common type is String. + /// No other types are compatible with Strings. TODO Enums? 
+ { + size_t have_string = type_ids.count(TypeIndex::String); + size_t have_fixed_string = type_ids.count(TypeIndex::FixedString); + + if (have_string || have_fixed_string) + { + bool all_strings = type_ids.size() == (have_string + have_fixed_string); + if (!all_strings) + return throwOrReturn<on_error>(types, "because some of them are String/FixedString and some of them are not", ErrorCodes::NO_COMMON_TYPE); + + return std::make_shared<DataTypeString>(); + } + } + + /// For Date and DateTime/DateTime64, the common type is DateTime/DateTime64. No other types are compatible. + { + size_t have_date = type_ids.count(TypeIndex::Date); + size_t have_date32 = type_ids.count(TypeIndex::Date32); + size_t have_datetime = type_ids.count(TypeIndex::DateTime); + size_t have_datetime64 = type_ids.count(TypeIndex::DateTime64); + + if (have_date || have_date32 || have_datetime || have_datetime64) + { + bool all_date_or_datetime = type_ids.size() == (have_date + have_date32 + have_datetime + have_datetime64); + if (!all_date_or_datetime) + return throwOrReturn<on_error>(types, + "because some of them are Date/Date32/DateTime/DateTime64 and some of them are not", + ErrorCodes::NO_COMMON_TYPE); + + if (have_datetime64 == 0 && have_date32 == 0) + { + for (const auto & type : types) + { + if (isDateTime(type)) + return type; + } + + return std::make_shared<DataTypeDateTime>(); + } + + /// For Date and Date32, the common type is Date32 + if (have_datetime == 0 && have_datetime64 == 0) + { + for (const auto & type : types) + { + if (isDate32(type)) + return type; + } + } + + /// For Datetime and Date32, the common type is Datetime64 + if (have_datetime == 1 && have_date32 == 1 && have_datetime64 == 0) + { + return std::make_shared<DataTypeDateTime64>(0); + } + + UInt8 max_scale = 0; + size_t max_scale_date_time_index = 0; + + for (size_t i = 0; i < types.size(); ++i) + { + const auto & type = types[i]; + + if (const auto * date_time64_type = typeid_cast<const DataTypeDateTime64 
*>(type.get())) + { + const auto scale = date_time64_type->getScale(); + if (scale >= max_scale) + { + max_scale_date_time_index = i; + max_scale = scale; + } + } + } + + return types[max_scale_date_time_index]; + } + } + + /// Decimals + { + size_t have_decimal32 = type_ids.count(TypeIndex::Decimal32); + size_t have_decimal64 = type_ids.count(TypeIndex::Decimal64); + size_t have_decimal128 = type_ids.count(TypeIndex::Decimal128); + size_t have_decimal256 = type_ids.count(TypeIndex::Decimal256); + + if (have_decimal32 || have_decimal64 || have_decimal128 || have_decimal256) + { + size_t num_supported = have_decimal32 + have_decimal64 + have_decimal128 + have_decimal256; + + std::array<TypeIndex, 8> int_ids = {TypeIndex::Int8, TypeIndex::UInt8, TypeIndex::Int16, TypeIndex::UInt16, + TypeIndex::Int32, TypeIndex::UInt32, TypeIndex::Int64, TypeIndex::UInt64}; + + TypeIndex max_int = TypeIndex::Nothing; + for (auto int_id : int_ids) + { + size_t num = type_ids.count(int_id); + num_supported += num; + if (num) + max_int = int_id; + } + + if (num_supported != type_ids.size()) + return throwOrReturn<on_error>(types, "because some of them have no lossless conversion to Decimal", ErrorCodes::NO_COMMON_TYPE); + + UInt32 max_scale = 0; + for (const auto & type : types) + { + auto type_id = type->getTypeId(); + if (type_id != TypeIndex::Decimal32 + && type_id != TypeIndex::Decimal64 + && type_id != TypeIndex::Decimal128 + && type_id != TypeIndex::Decimal256) + { + continue; + } + + UInt32 scale = getDecimalScale(*type); + if (scale > max_scale) + max_scale = scale; + } + + UInt32 min_precision = max_scale + leastDecimalPrecisionFor(max_int); + + /// special cases Int32 -> Dec32, Int64 -> Dec64 + if (max_scale == 0) + { + if (max_int == TypeIndex::Int32) + min_precision = DataTypeDecimal<Decimal32>::maxPrecision(); + else if (max_int == TypeIndex::Int64) + min_precision = DataTypeDecimal<Decimal64>::maxPrecision(); + } + + if (min_precision > 
DataTypeDecimal<Decimal256>::maxPrecision()) + return throwOrReturn<on_error>(types, "because the least supertype is Decimal(" + + toString(min_precision) + ',' + toString(max_scale) + ')', + ErrorCodes::NO_COMMON_TYPE); + + if (have_decimal256 || min_precision > DataTypeDecimal<Decimal128>::maxPrecision()) + return std::make_shared<DataTypeDecimal<Decimal256>>(DataTypeDecimal<Decimal256>::maxPrecision(), max_scale); + if (have_decimal128 || min_precision > DataTypeDecimal<Decimal64>::maxPrecision()) + return std::make_shared<DataTypeDecimal<Decimal128>>(DataTypeDecimal<Decimal128>::maxPrecision(), max_scale); + if (have_decimal64 || min_precision > DataTypeDecimal<Decimal32>::maxPrecision()) + return std::make_shared<DataTypeDecimal<Decimal64>>(DataTypeDecimal<Decimal64>::maxPrecision(), max_scale); + return std::make_shared<DataTypeDecimal<Decimal32>>(DataTypeDecimal<Decimal32>::maxPrecision(), max_scale); + } + } + + /// For numeric types, the most complicated part. + { + auto numeric_type = getNumericType<on_error>(type_ids); + if (numeric_type) + return numeric_type; + } + + /// All other data types (UUID, AggregateFunction, Enum...) are compatible only if they are the same (checked in trivial cases). 
+ return throwOrReturn<on_error>(types, "", ErrorCodes::NO_COMMON_TYPE); +} + +DataTypePtr getLeastSupertypeOrString(const DataTypes & types) +{ + return getLeastSupertype<LeastSupertypeOnError::String>(types); +} + +DataTypePtr tryGetLeastSupertype(const DataTypes & types) +{ + return getLeastSupertype<LeastSupertypeOnError::Null>(types); +} + +template <LeastSupertypeOnError on_error> +DataTypePtr getLeastSupertype(const TypeIndexSet & types) +{ + if (types.empty()) + return std::make_shared<DataTypeNothing>(); + + if (types.size() == 1) + { + WhichDataType which(*types.begin()); + if (which.isNothing()) + return std::make_shared<DataTypeNothing>(); + + #define DISPATCH(TYPE) \ + if (which.idx == TypeIndex::TYPE) \ + return std::make_shared<DataTypeNumber<TYPE>>(); /// NOLINT + + FOR_NUMERIC_TYPES(DISPATCH) + #undef DISPATCH + + if (which.isString()) + return std::make_shared<DataTypeString>(); + + return throwOrReturn<on_error>(types, "because cannot get common type by type indexes with non-simple types", ErrorCodes::NO_COMMON_TYPE); + } + + if (types.contains(TypeIndex::String)) + { + bool only_string = types.size() == 2 && types.contains(TypeIndex::Nothing); + if (!only_string) + return throwOrReturn<on_error>(types, "because some of them are String and some of them are not", ErrorCodes::NO_COMMON_TYPE); + + return std::make_shared<DataTypeString>(); + } + + auto numeric_type = getNumericType<on_error>(types); + if (numeric_type) + return numeric_type; + + return throwOrReturn<on_error>(types, "", ErrorCodes::NO_COMMON_TYPE); +} + +DataTypePtr getLeastSupertypeOrString(const TypeIndexSet & types) +{ + return getLeastSupertype<LeastSupertypeOnError::String>(types); +} + +DataTypePtr tryGetLeastSupertype(const TypeIndexSet & types) +{ + return getLeastSupertype<LeastSupertypeOnError::Null>(types); +} + +template DataTypePtr getLeastSupertype<LeastSupertypeOnError::Throw>(const DataTypes & types); +template DataTypePtr 
getLeastSupertype<LeastSupertypeOnError::Throw>(const TypeIndexSet & types); + +} diff --git a/contrib/clickhouse/src/DataTypes/getLeastSupertype.h b/contrib/clickhouse/src/DataTypes/getLeastSupertype.h new file mode 100644 index 00000000000..2ef4a0e6850 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/getLeastSupertype.h @@ -0,0 +1,39 @@ +#pragma once +#include <DataTypes/IDataType.h> + +namespace DB +{ + +enum class LeastSupertypeOnError +{ + Throw, + String, + Null, +}; + +/** Get data type that covers all possible values of passed data types. + * If there is no such data type, throws an exception. + * + * Examples: least common supertype for UInt8, Int8 - Int16. + * Examples: there is no least common supertype for Array(UInt8), Int8. + */ +template <LeastSupertypeOnError on_error = LeastSupertypeOnError::Throw> +DataTypePtr getLeastSupertype(const DataTypes & types); + +/// Same as above but return String type instead of throwing exception. +/// All types can be casted to String, because they can be serialized to String. +DataTypePtr getLeastSupertypeOrString(const DataTypes & types); + +/// Same as above but return nullptr instead of throwing exception. 
+DataTypePtr tryGetLeastSupertype(const DataTypes & types); + +using TypeIndexSet = std::unordered_set<TypeIndex>; + +template <LeastSupertypeOnError on_error = LeastSupertypeOnError::Throw> +DataTypePtr getLeastSupertype(const TypeIndexSet & types); + +DataTypePtr getLeastSupertypeOrString(const TypeIndexSet & types); + +DataTypePtr tryGetLeastSupertype(const TypeIndexSet & types); + +} diff --git a/contrib/clickhouse/src/DataTypes/getMostSubtype.cpp b/contrib/clickhouse/src/DataTypes/getMostSubtype.cpp new file mode 100644 index 00000000000..33b5735456e --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/getMostSubtype.cpp @@ -0,0 +1,398 @@ +#include <IO/WriteBufferFromString.h> +#include <IO/Operators.h> +#include <Common/typeid_cast.h> + +#include <DataTypes/getMostSubtype.h> + +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/DataTypeNullable.h> +#include <DataTypes/DataTypeNothing.h> +#include <DataTypes/DataTypeString.h> +#include <DataTypes/DataTypeDate.h> +#include <DataTypes/DataTypesNumber.h> +#include <DataTypes/DataTypesDecimal.h> + + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int NO_COMMON_TYPE; +} + +namespace +{ +String getExceptionMessagePrefix(const DataTypes & types) +{ + WriteBufferFromOwnString res; + res << "There is no subtype for types "; + + bool first = true; + for (const auto & type : types) + { + if (!first) + res << ", "; + first = false; + + res << type->getName(); + } + + return res.str(); +} + +} + + +DataTypePtr getMostSubtype(const DataTypes & types, bool throw_if_result_is_nothing, bool force_support_conversion) +{ + auto get_nothing_or_throw = [throw_if_result_is_nothing, & types](const std::string & reason) + { + if (throw_if_result_is_nothing) + throw Exception::createDeprecated(getExceptionMessagePrefix(types) + reason, ErrorCodes::NO_COMMON_TYPE); + return std::make_shared<DataTypeNothing>(); + }; + + /// Trivial cases + + if (types.empty()) + { + if 
(throw_if_result_is_nothing) + throw Exception(ErrorCodes::NO_COMMON_TYPE, "There is no common type for empty type list"); + return std::make_shared<DataTypeNothing>(); + } + + if (types.size() == 1) + { + if (throw_if_result_is_nothing && typeid_cast<const DataTypeNothing *>(types[0].get())) + throw Exception(ErrorCodes::NO_COMMON_TYPE, "There is no common type for type Nothing"); + return types[0]; + } + + /// All types are equal + { + bool all_equal = true; + for (size_t i = 1, size = types.size(); i < size; ++i) + { + if (!types[i]->equals(*types[0])) + { + all_equal = false; + break; + } + } + + if (all_equal) + return types[0]; + } + + /// Recursive rules + + /// If there are Nothing types, result is Nothing + { + for (const auto & type : types) + if (typeid_cast<const DataTypeNothing *>(type.get())) + return get_nothing_or_throw(" because some of them are Nothing"); + } + + /// For Arrays + { + bool have_array = false; + bool all_arrays = true; + + DataTypes nested_types; + nested_types.reserve(types.size()); + + for (const auto & type : types) + { + if (const auto * type_array = typeid_cast<const DataTypeArray *>(type.get())) + { + have_array = true; + nested_types.emplace_back(type_array->getNestedType()); + } + else + all_arrays = false; + } + + if (have_array) + { + if (!all_arrays) + return get_nothing_or_throw(" because some of them are Array and some of them are not"); + + return std::make_shared<DataTypeArray>(getMostSubtype(nested_types, false, force_support_conversion)); + } + } + + /// For tuples + { + bool have_tuple = false; + bool all_tuples = true; + size_t tuple_size = 0; + + std::vector<DataTypes> nested_types; + + for (const auto & type : types) + { + if (const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type.get())) + { + if (!have_tuple) + { + tuple_size = type_tuple->getElements().size(); + nested_types.resize(tuple_size); + for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) + 
nested_types[elem_idx].reserve(types.size()); + } + else if (tuple_size != type_tuple->getElements().size()) + return get_nothing_or_throw(" because Tuples have different sizes"); + + have_tuple = true; + + for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) + nested_types[elem_idx].emplace_back(type_tuple->getElements()[elem_idx]); + } + else + all_tuples = false; + } + + if (have_tuple) + { + if (!all_tuples) + return get_nothing_or_throw(" because some of them are Tuple and some of them are not"); + + DataTypes common_tuple_types(tuple_size); + for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) + common_tuple_types[elem_idx] = + getMostSubtype(nested_types[elem_idx], throw_if_result_is_nothing, force_support_conversion); + + return std::make_shared<DataTypeTuple>(common_tuple_types); + } + } + + /// For Nullable + { + bool all_nullable = true; + bool have_nullable = false; + + DataTypes nested_types; + nested_types.reserve(types.size()); + + for (const auto & type : types) + { + if (const auto * type_nullable = typeid_cast<const DataTypeNullable *>(type.get())) + { + have_nullable = true; + nested_types.emplace_back(type_nullable->getNestedType()); + } + else + { + all_nullable = false; + nested_types.emplace_back(type); + } + } + + if (have_nullable) + { + if (all_nullable || force_support_conversion) + return std::make_shared<DataTypeNullable>(getMostSubtype(nested_types, false, force_support_conversion)); + + return getMostSubtype(nested_types, throw_if_result_is_nothing, force_support_conversion); + } + } + + /// Non-recursive rules + + /// For String and FixedString, the common type is FixedString. + /// For different FixedStrings, the common type is Nothing. + /// No other types are compatible with Strings. TODO Enums? 
+ { + bool have_string = false; + bool all_strings = true; + + DataTypePtr fixed_string_type = nullptr; + + for (const auto & type : types) + { + if (isFixedString(type)) + { + have_string = true; + if (!fixed_string_type) + fixed_string_type = type; + else if (!type->equals(*fixed_string_type)) + return get_nothing_or_throw(" because some of them are FixedStrings with different length"); + } + else if (isString(type)) + have_string = true; + else + all_strings = false; + } + + if (have_string) + { + if (!all_strings) + return get_nothing_or_throw(" because some of them are String/FixedString and some of them are not"); + + return fixed_string_type ? fixed_string_type : std::make_shared<DataTypeString>(); + } + } + + /// For Date and DateTime, the common type is Date. No other types are compatible. + { + bool have_date_or_datetime = false; + bool all_date_or_datetime = true; + + for (const auto & type : types) + { + if (isDate(type) || isDateTime(type) || isDateTime64(type)) + have_date_or_datetime = true; + else + all_date_or_datetime = false; + } + + if (have_date_or_datetime) + { + if (!all_date_or_datetime) + return get_nothing_or_throw(" because some of them are Date/DateTime and some of them are not"); + + return std::make_shared<DataTypeDate>(); + } + } + + /// For numeric types, the most complicated part. 
+ { + bool all_numbers = true; + + size_t min_bits_of_signed_integer = 0; + size_t min_bits_of_unsigned_integer = 0; + size_t min_mantissa_bits_of_floating = 0; + + auto minimize = [](size_t & what, size_t value) + { + if (what == 0 || value < what) + what = value; + }; + + for (const auto & type : types) + { + if (typeid_cast<const DataTypeUInt8 *>(type.get())) + minimize(min_bits_of_unsigned_integer, 8); + else if (typeid_cast<const DataTypeUInt16 *>(type.get())) + minimize(min_bits_of_unsigned_integer, 16); + else if (typeid_cast<const DataTypeUInt32 *>(type.get())) + minimize(min_bits_of_unsigned_integer, 32); + else if (typeid_cast<const DataTypeUInt64 *>(type.get())) + minimize(min_bits_of_unsigned_integer, 64); + else if (typeid_cast<const DataTypeUInt128 *>(type.get())) + minimize(min_bits_of_unsigned_integer, 128); + else if (typeid_cast<const DataTypeUInt256 *>(type.get())) + minimize(min_bits_of_unsigned_integer, 256); + else if (typeid_cast<const DataTypeInt8 *>(type.get())) + minimize(min_bits_of_signed_integer, 8); + else if (typeid_cast<const DataTypeInt16 *>(type.get())) + minimize(min_bits_of_signed_integer, 16); + else if (typeid_cast<const DataTypeInt32 *>(type.get())) + minimize(min_bits_of_signed_integer, 32); + else if (typeid_cast<const DataTypeInt64 *>(type.get())) + minimize(min_bits_of_signed_integer, 64); + else if (typeid_cast<const DataTypeInt128 *>(type.get())) + minimize(min_bits_of_signed_integer, 128); + else if (typeid_cast<const DataTypeInt256 *>(type.get())) + minimize(min_bits_of_signed_integer, 256); + else if (typeid_cast<const DataTypeFloat32 *>(type.get())) + minimize(min_mantissa_bits_of_floating, 24); + else if (typeid_cast<const DataTypeFloat64 *>(type.get())) + minimize(min_mantissa_bits_of_floating, 53); + else + all_numbers = false; + } + + if (min_bits_of_signed_integer || min_bits_of_unsigned_integer || min_mantissa_bits_of_floating) + { + if (!all_numbers) + return get_nothing_or_throw(" because some of them are 
numbers and some of them are not"); + + /// If the result must be floating. + if (!min_bits_of_signed_integer && !min_bits_of_unsigned_integer) + { + if (min_mantissa_bits_of_floating <= 24) + return std::make_shared<DataTypeFloat32>(); + else if (min_mantissa_bits_of_floating <= 53) + return std::make_shared<DataTypeFloat64>(); + else + throw Exception(ErrorCodes::NO_COMMON_TYPE, + "Logical error: {} but as all data types are floats, " + "we must have found maximum float type", getExceptionMessagePrefix(types)); + } + + /// If there are signed and unsigned types of same bit-width, the result must be unsigned number. + if (min_bits_of_unsigned_integer && + (min_bits_of_signed_integer == 0 || min_bits_of_unsigned_integer <= min_bits_of_signed_integer)) + { + if (min_bits_of_unsigned_integer <= 8) + return std::make_shared<DataTypeUInt8>(); + else if (min_bits_of_unsigned_integer <= 16) + return std::make_shared<DataTypeUInt16>(); + else if (min_bits_of_unsigned_integer <= 32) + return std::make_shared<DataTypeUInt32>(); + else if (min_bits_of_unsigned_integer <= 64) + return std::make_shared<DataTypeUInt64>(); + else if (min_bits_of_unsigned_integer <= 128) + return std::make_shared<DataTypeUInt128>(); + else if (min_bits_of_unsigned_integer <= 256) + return std::make_shared<DataTypeUInt256>(); + else + throw Exception(ErrorCodes::NO_COMMON_TYPE, + "Logical error: {} but as all data types are integers, " + "we must have found maximum unsigned integer type", + getExceptionMessagePrefix(types)); + } + + /// All signed. 
+ { + if (min_bits_of_signed_integer <= 8) + return std::make_shared<DataTypeInt8>(); + else if (min_bits_of_signed_integer <= 16) + return std::make_shared<DataTypeInt16>(); + else if (min_bits_of_signed_integer <= 32) + return std::make_shared<DataTypeInt32>(); + else if (min_bits_of_signed_integer <= 64) + return std::make_shared<DataTypeInt64>(); + else if (min_bits_of_signed_integer <= 128) + return std::make_shared<DataTypeInt128>(); + else if (min_bits_of_signed_integer <= 256) + return std::make_shared<DataTypeInt256>(); + else + throw Exception(ErrorCodes::NO_COMMON_TYPE, + "Logical error: {} but as all data types are integers, " + "we must have found maximum signed integer type", getExceptionMessagePrefix(types)); + } + } + } + + /// Decimals + { + bool all_decimals = true; + UInt32 min_scale = std::numeric_limits<UInt32>::max(); + UInt32 min_precision = std::numeric_limits<UInt32>::max(); + for (const auto & type : types) + { + if (isDecimal(type)) + { + min_scale = std::min(min_scale, getDecimalScale(*type)); + min_precision = std::min(min_precision, getDecimalPrecision(*type)); + } + else + { + all_decimals = false; + break; + } + } + + if (all_decimals) + return createDecimal<DataTypeDecimal>(min_precision, min_scale); + } + + /// All other data types (UUID, AggregateFunction, Enum...) are compatible only if they are the same (checked in trivial cases). + return get_nothing_or_throw(""); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/getMostSubtype.h b/contrib/clickhouse/src/DataTypes/getMostSubtype.h new file mode 100644 index 00000000000..c46cf4e2054 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/getMostSubtype.h @@ -0,0 +1,19 @@ +#pragma once + +#include <DataTypes/IDataType.h> + + +namespace DB +{ + +/** Get data type that covers intersection of all possible values of passed data types. + * DataTypeNothing is the most common subtype for all types. + * Examples: most common subtype for UInt16, UInt8 and Int8 - UInt16. 
+ * Examples: most common subtype for Array(UInt8), Int8 is Nothing + * + * If force_support_conversion is true, returns type which may be used to convert each argument to. + * Example: most common subtype for Array(UInt8) and Array(Nullable(Int32)) is Array(Nullable(UInt8) if force_support_conversion is true. + */ +DataTypePtr getMostSubtype(const DataTypes & types, bool throw_if_result_is_nothing = false, bool force_support_conversion = false); + +} diff --git a/contrib/clickhouse/src/DataTypes/hasNullable.cpp b/contrib/clickhouse/src/DataTypes/hasNullable.cpp new file mode 100644 index 00000000000..908b9880473 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/hasNullable.cpp @@ -0,0 +1,33 @@ +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeMap.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/hasNullable.h> + +namespace DB +{ + +bool hasNullable(const DataTypePtr & type) +{ + if (isNullableOrLowCardinalityNullable(type)) + return true; + + if (const DataTypeArray * type_array = typeid_cast<const DataTypeArray *>(type.get())) + return hasNullable(type_array->getNestedType()); + else if (const DataTypeTuple * type_tuple = typeid_cast<const DataTypeTuple *>(type.get())) + { + for (const auto & subtype : type_tuple->getElements()) + { + if (hasNullable(subtype)) + return true; + } + return false; + } + else if (const DataTypeMap * type_map = typeid_cast<const DataTypeMap *>(type.get())) + { + // Key type cannot be nullable. We only check value type. 
+ return hasNullable(type_map->getValueType()); + } + return false; +} + +} diff --git a/contrib/clickhouse/src/DataTypes/hasNullable.h b/contrib/clickhouse/src/DataTypes/hasNullable.h new file mode 100644 index 00000000000..271803496f1 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/hasNullable.h @@ -0,0 +1,10 @@ +#pragma once + +#include <DataTypes/IDataType.h> + +namespace DB +{ + +bool hasNullable(const DataTypePtr & type); + +} diff --git a/contrib/clickhouse/src/DataTypes/registerDataTypeDateTime.cpp b/contrib/clickhouse/src/DataTypes/registerDataTypeDateTime.cpp new file mode 100644 index 00000000000..2b5c4a0a143 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/registerDataTypeDateTime.cpp @@ -0,0 +1,118 @@ + +#include <Core/Field.h> +#include <Parsers/IAST.h> +#include <Parsers/ASTLiteral.h> +#include <DataTypes/IDataType.h> +#include <DataTypes/DataTypeDateTime.h> +#include <DataTypes/DataTypeDateTime64.h> +#include <DataTypes/DataTypeFactory.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +enum class ArgumentKind +{ + Optional, + Mandatory +}; + +PreformattedMessage getExceptionMessage( + const String & message, size_t argument_index, const char * argument_name, + const std::string & context_data_type_name, Field::Types::Which field_type) +{ + return PreformattedMessage::create("Parameter #{} '{}' for {}{}, expected {} literal", + argument_index, argument_name, context_data_type_name, message, field_type); +} + +template <typename T, ArgumentKind Kind> +std::conditional_t<Kind == ArgumentKind::Optional, std::optional<T>, T> +getArgument(const ASTPtr & arguments, size_t argument_index, const char * argument_name [[maybe_unused]], const std::string context_data_type_name) +{ + using NearestResultType = NearestFieldType<T>; + const auto field_type = Field::TypeToEnum<NearestResultType>::value; + const ASTLiteral * argument = nullptr; + + if 
(!arguments || arguments->children.size() <= argument_index + || !(argument = arguments->children[argument_index]->as<ASTLiteral>()) + || argument->value.getType() != field_type) + { + if constexpr (Kind == ArgumentKind::Optional) + return {}; + else + { + if (argument && argument->value.getType() != field_type) + throw Exception(getExceptionMessage(fmt::format(" has wrong type: {}", argument->value.getTypeName()), + argument_index, argument_name, context_data_type_name, field_type), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + else + throw Exception(getExceptionMessage(" is missing", argument_index, argument_name, context_data_type_name, field_type), + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + } + } + + return argument->value.get<NearestResultType>(); +} + +static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.empty()) + return std::make_shared<DataTypeDateTime>(); + + const auto scale = getArgument<UInt64, ArgumentKind::Optional>(arguments, 0, "scale", "DateTime"); + const auto timezone = getArgument<String, ArgumentKind::Optional>(arguments, scale ? 
1 : 0, "timezone", "DateTime"); + + if (!scale && !timezone) + throw Exception(getExceptionMessage(" has wrong type: ", 0, "scale", "DateTime", Field::Types::Which::UInt64), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + /// If scale is defined, the data type is DateTime when scale = 0 otherwise the data type is DateTime64 + if (scale && scale.value() != 0) + return std::make_shared<DataTypeDateTime64>(scale.value(), timezone.value_or(String{})); + + return std::make_shared<DataTypeDateTime>(timezone.value_or(String{})); +} + +static DataTypePtr create32(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.empty()) + return std::make_shared<DataTypeDateTime>(); + + if (arguments->children.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "DateTime32 data type can optionally have only one argument - time zone name"); + + const auto timezone = getArgument<String, ArgumentKind::Mandatory>(arguments, 0, "timezone", "DateTime32"); + + return std::make_shared<DataTypeDateTime>(timezone); +} + +static DataTypePtr create64(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.empty()) + return std::make_shared<DataTypeDateTime64>(DataTypeDateTime64::default_scale); + + if (arguments->children.size() > 2) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "DateTime64 data type can optionally have two argument - scale and time zone name"); + + const auto scale = getArgument<UInt64, ArgumentKind::Mandatory>(arguments, 0, "scale", "DateTime64"); + const auto timezone = getArgument<String, ArgumentKind::Optional>(arguments, 1, "timezone", "DateTime64"); + + return std::make_shared<DataTypeDateTime64>(scale, timezone.value_or(String{})); +} + +void registerDataTypeDateTime(DataTypeFactory & factory) +{ + factory.registerDataType("DateTime", create, DataTypeFactory::CaseInsensitive); + factory.registerDataType("DateTime32", create32, DataTypeFactory::CaseInsensitive); + factory.registerDataType("DateTime64", 
create64, DataTypeFactory::CaseInsensitive); + + factory.registerAlias("TIMESTAMP", "DateTime", DataTypeFactory::CaseInsensitive); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/transformTypesRecursively.cpp b/contrib/clickhouse/src/DataTypes/transformTypesRecursively.cpp new file mode 100644 index 00000000000..cdf221a6b72 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/transformTypesRecursively.cpp @@ -0,0 +1,172 @@ +#include <DataTypes/transformTypesRecursively.h> +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeMap.h> +#include <DataTypes/DataTypeTuple.h> +#include <DataTypes/DataTypeNullable.h> + + +namespace DB +{ + +TypeIndexesSet getTypesIndexes(const DataTypes & types) +{ + TypeIndexesSet type_indexes; + for (const auto & type : types) + type_indexes.insert(type->getTypeId()); + return type_indexes; +} + +void transformTypesRecursively(DataTypes & types, std::function<void(DataTypes &, TypeIndexesSet &)> transform_simple_types, std::function<void(DataTypes &, TypeIndexesSet &)> transform_complex_types) +{ + TypeIndexesSet type_indexes = getTypesIndexes(types); + + /// Nullable + if (type_indexes.contains(TypeIndex::Nullable)) + { + std::vector<UInt8> is_nullable; + is_nullable.reserve(types.size()); + DataTypes nested_types; + nested_types.reserve(types.size()); + for (const auto & type : types) + { + if (const DataTypeNullable * type_nullable = typeid_cast<const DataTypeNullable *>(type.get())) + { + is_nullable.push_back(1); + nested_types.push_back(type_nullable->getNestedType()); + } + else + { + is_nullable.push_back(0); + nested_types.push_back(type); + } + } + + transformTypesRecursively(nested_types, transform_simple_types, transform_complex_types); + for (size_t i = 0; i != types.size(); ++i) + { + /// Type could be changed so it cannot be inside Nullable anymore. 
+ if (is_nullable[i] && nested_types[i]->canBeInsideNullable()) + types[i] = makeNullable(nested_types[i]); + else + types[i] = nested_types[i]; + } + + if (transform_complex_types) + { + /// Some types could be changed. + type_indexes = getTypesIndexes(types); + transform_complex_types(types, type_indexes); + } + + return; + } + + /// Arrays + if (type_indexes.contains(TypeIndex::Array)) + { + /// All types are Array + if (type_indexes.size() == 1) + { + DataTypes nested_types; + for (const auto & type : types) + nested_types.push_back(typeid_cast<const DataTypeArray *>(type.get())->getNestedType()); + + transformTypesRecursively(nested_types, transform_simple_types, transform_complex_types); + for (size_t i = 0; i != types.size(); ++i) + types[i] = std::make_shared<DataTypeArray>(nested_types[i]); + } + + if (transform_complex_types) + transform_complex_types(types, type_indexes); + + return; + } + + /// Tuples + if (type_indexes.contains(TypeIndex::Tuple)) + { + /// All types are Tuple + if (type_indexes.size() == 1) + { + std::vector<DataTypes> nested_types; + const DataTypeTuple * type_tuple = typeid_cast<const DataTypeTuple *>(types[0].get()); + size_t tuple_size = type_tuple->getElements().size(); + nested_types.resize(tuple_size); + for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) + nested_types[elem_idx].reserve(types.size()); + + bool sizes_are_equal = true; + for (const auto & type : types) + { + type_tuple = typeid_cast<const DataTypeTuple *>(type.get()); + if (type_tuple->getElements().size() != tuple_size) + { + sizes_are_equal = false; + break; + } + + for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) + nested_types[elem_idx].emplace_back(type_tuple->getElements()[elem_idx]); + } + + if (sizes_are_equal) + { + std::vector<DataTypes> transposed_nested_types(types.size()); + for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) + { + transformTypesRecursively(nested_types[elem_idx], transform_simple_types, 
transform_complex_types); + for (size_t i = 0; i != types.size(); ++i) + transposed_nested_types[i].push_back(nested_types[elem_idx][i]); + } + + for (size_t i = 0; i != types.size(); ++i) + types[i] = std::make_shared<DataTypeTuple>(transposed_nested_types[i]); + } + } + + if (transform_complex_types) + transform_complex_types(types, type_indexes); + + return; + } + + /// Maps + if (type_indexes.contains(TypeIndex::Map)) + { + /// All types are Map + if (type_indexes.size() == 1) + { + DataTypes key_types; + DataTypes value_types; + key_types.reserve(types.size()); + value_types.reserve(types.size()); + for (const auto & type : types) + { + const DataTypeMap * type_map = typeid_cast<const DataTypeMap *>(type.get()); + key_types.emplace_back(type_map->getKeyType()); + value_types.emplace_back(type_map->getValueType()); + } + + transformTypesRecursively(key_types, transform_simple_types, transform_complex_types); + transformTypesRecursively(value_types, transform_simple_types, transform_complex_types); + + for (size_t i = 0; i != types.size(); ++i) + types[i] = std::make_shared<DataTypeMap>(key_types[i], value_types[i]); + } + + if (transform_complex_types) + transform_complex_types(types, type_indexes); + + return; + } + + transform_simple_types(types, type_indexes); +} + +void callOnNestedSimpleTypes(DataTypePtr & type, std::function<void(DataTypePtr &)> callback) +{ + DataTypes types = {type}; + transformTypesRecursively(types, [callback](auto & data_types, TypeIndexesSet &){ callback(data_types[0]); }, {}); +} + +} diff --git a/contrib/clickhouse/src/DataTypes/transformTypesRecursively.h b/contrib/clickhouse/src/DataTypes/transformTypesRecursively.h new file mode 100644 index 00000000000..f9c776b4205 --- /dev/null +++ b/contrib/clickhouse/src/DataTypes/transformTypesRecursively.h @@ -0,0 +1,19 @@ +#pragma once + +#include <DataTypes/IDataType.h> +#include <functional> + +namespace DB +{ + +/// Function that applies custom transformation functions to provided 
types recursively. +/// Implementation is similar to function getLeastSuperType: +/// If all types are Array/Map/Tuple/Nullable, this function will be called to nested types. +/// If not all types are the same complex type (Array/Map/Tuple), this function won't be called to nested types. +/// Function transform_simple_types will be applied to resulting simple types after all recursive calls. +/// Function transform_complex_types will be applied to complex types (Array/Map/Tuple) after recursive call to their nested types. +void transformTypesRecursively(DataTypes & types, std::function<void(DataTypes &, TypeIndexesSet &)> transform_simple_types, std::function<void(DataTypes &, TypeIndexesSet &)> transform_complex_types); + +void callOnNestedSimpleTypes(DataTypePtr & type, std::function<void(DataTypePtr &)> callback); + +} |
