diff options
| author | vitalyisaev <[email protected]> | 2023-11-14 09:58:56 +0300 |
|---|---|---|
| committer | vitalyisaev <[email protected]> | 2023-11-14 10:20:20 +0300 |
| commit | c2b2dfd9827a400a8495e172a56343462e3ceb82 (patch) | |
| tree | cd4e4f597d01bede4c82dffeb2d780d0a9046bd0 /contrib/clickhouse/src/Functions/FunctionGenerateRandomStructure.cpp | |
| parent | d4ae8f119e67808cb0cf776ba6e0cf95296f2df7 (diff) | |
YQ Connector: move tests from yql to ydb (OSS)
Перенос папки с тестами на Коннектор из папки yql в папку ydb (синхронизируется с github).
Diffstat (limited to 'contrib/clickhouse/src/Functions/FunctionGenerateRandomStructure.cpp')
| -rw-r--r-- | contrib/clickhouse/src/Functions/FunctionGenerateRandomStructure.cpp | 446 |
1 files changed, 446 insertions, 0 deletions
diff --git a/contrib/clickhouse/src/Functions/FunctionGenerateRandomStructure.cpp b/contrib/clickhouse/src/Functions/FunctionGenerateRandomStructure.cpp new file mode 100644 index 00000000000..f85b2596530 --- /dev/null +++ b/contrib/clickhouse/src/Functions/FunctionGenerateRandomStructure.cpp @@ -0,0 +1,446 @@ +#include <Functions/FunctionGenerateRandomStructure.h> +#include <Functions/FunctionFactory.h> +#include <Functions/FunctionHelpers.h> +#include <Functions/IFunction.h> +#include <Columns/ColumnString.h> +#include <DataTypes/DataTypeString.h> +#include <DataTypes/DataTypeFixedString.h> +#include <Interpreters/Context.h> +#include <Common/randomSeed.h> +#include <Common/FunctionDocumentation.h> +#include <IO/WriteHelpers.h> +#include <IO/WriteBufferFromVector.h> + +#include <pcg_random.hpp> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int BAD_ARGUMENTS; +} + +namespace +{ + const size_t MAX_NUMBER_OF_COLUMNS = 128; + const size_t MAX_TUPLE_ELEMENTS = 16; + const size_t MAX_DATETIME64_PRECISION = 9; + const size_t MAX_DECIMAL32_PRECISION = 9; + const size_t MAX_DECIMAL64_PRECISION = 18; + const size_t MAX_DECIMAL128_PRECISION = 38; + const size_t MAX_DECIMAL256_PRECISION = 76; + const size_t MAX_DEPTH = 16; + + constexpr std::array<TypeIndex, 29> simple_types + { + TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64, + TypeIndex::Int128, + TypeIndex::UInt128, + TypeIndex::Int256, + TypeIndex::UInt256, + TypeIndex::Float32, + TypeIndex::Float64, + TypeIndex::Decimal32, + TypeIndex::Decimal64, + TypeIndex::Decimal128, + TypeIndex::Decimal256, + TypeIndex::Date, + TypeIndex::Date32, + TypeIndex::DateTime, + TypeIndex::DateTime64, + TypeIndex::String, + TypeIndex::FixedString, + TypeIndex::Enum8, + TypeIndex::Enum16, + TypeIndex::IPv4, + TypeIndex::IPv6, + TypeIndex::UUID, + }; + + constexpr std::array<TypeIndex, 5> complex_types + { + TypeIndex::Nullable, + TypeIndex::LowCardinality, + TypeIndex::Array, + TypeIndex::Tuple, + TypeIndex::Map, + }; + + constexpr std::array<TypeIndex, 22> map_key_types + { + TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64, + TypeIndex::Int128, + TypeIndex::UInt128, + TypeIndex::Int256, + TypeIndex::UInt256, + TypeIndex::Date, + TypeIndex::Date32, + TypeIndex::DateTime, + TypeIndex::String, + TypeIndex::FixedString, + TypeIndex::IPv4, + TypeIndex::Enum8, + TypeIndex::Enum16, + TypeIndex::UUID, + TypeIndex::LowCardinality, + }; + + constexpr std::array<TypeIndex, 22> suspicious_lc_types + { + TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64, + TypeIndex::Int128, + TypeIndex::UInt128, + TypeIndex::Int256, + TypeIndex::UInt256, + TypeIndex::Float32, + TypeIndex::Float64, + TypeIndex::Date, + TypeIndex::Date32, + TypeIndex::DateTime, + TypeIndex::String, + TypeIndex::FixedString, + TypeIndex::IPv4, + TypeIndex::IPv6, + TypeIndex::UUID, + }; + + template <bool allow_complex_types> + constexpr auto getAllTypes() + { + constexpr size_t complex_types_size = complex_types.size() * allow_complex_types; + constexpr size_t result_size = simple_types.size() + complex_types_size; + std::array<TypeIndex, result_size> result; + size_t index = 0; + + for (size_t i = 0; i != simple_types.size(); ++i, ++index) + result[index] = simple_types[i]; + + for (size_t i = 0; i != complex_types_size; ++i, ++index) + result[index] = complex_types[i]; + + return result; + } + + size_t generateNumberOfColumns(pcg64 & rng) + { + return rng() % MAX_NUMBER_OF_COLUMNS + 1; + } + + void writeLowCardinalityNestedType(pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types) + { + bool make_nullable = rng() % 2; + if (make_nullable) + writeCString("Nullable(", buf); + + if (allow_suspicious_lc_types) + { + TypeIndex type = suspicious_lc_types[rng() % suspicious_lc_types.size()]; + + if (type == TypeIndex::FixedString) + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + else + writeString(magic_enum::enum_name<TypeIndex>(type), buf); + } + else + { + /// Support only String and FixedString. + if (rng() % 2) + writeCString("String", buf); + else + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + } + + if (make_nullable) + writeChar(')', buf); + } + + void writeEnumValues(const String & column_name, pcg64 & rng, WriteBuffer & buf, ssize_t max_value) + { + /// Don't generate big enums, because it will lead to really big result + /// and slowness of this function, and it can lead to `Max query size exceeded` + /// while using this function with generateRandom. + size_t num_values = rng() % 16 + 1; + std::vector<Int16> values(num_values); + + /// Generate random numbers from range [-(max_value + 1), max_value - num_values + 1]. + for (Int16 & x : values) + x = rng() % (2 * max_value + 3 - num_values) - max_value - 1; + /// Make all numbers unique. + std::sort(values.begin(), values.end()); + for (size_t i = 0; i < num_values; ++i) + values[i] += i; + std::shuffle(values.begin(), values.end(), rng); + for (size_t i = 0; i != num_values; ++i) + { + if (i != 0) + writeCString(", ", buf); + writeString("'" + column_name + "V" + std::to_string(i) + "' = " + std::to_string(values[i]), buf); + } + } + + void writeMapKeyType(const String & column_name, pcg64 & rng, WriteBuffer & buf) + { + TypeIndex type = map_key_types[rng() % map_key_types.size()]; + switch (type) + { + case TypeIndex::FixedString: + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + break; + case TypeIndex::LowCardinality: + writeCString("LowCardinality(", buf); + /// Map key supports only String and FixedString inside LowCardinality. + if (rng() % 2) + writeCString("String", buf); + else + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + writeChar(')', buf); + break; + case TypeIndex::Enum8: + writeCString("Enum8(", buf); + writeEnumValues(column_name, rng, buf, INT8_MAX); + writeChar(')', buf); + break; + case TypeIndex::Enum16: + writeCString("Enum16(", buf); + writeEnumValues(column_name, rng, buf, INT16_MAX); + writeChar(')', buf); + break; + default: + writeString(magic_enum::enum_name<TypeIndex>(type), buf); + break; + } + } + + template <bool allow_complex_types = true> + void writeRandomType(const String & column_name, pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types, size_t depth = 0) + { + if (allow_complex_types && depth > MAX_DEPTH) + writeRandomType<false>(column_name, rng, buf, depth); + + constexpr auto all_types = getAllTypes<allow_complex_types>(); + auto type = all_types[rng() % all_types.size()]; + + switch (type) + { + case TypeIndex::UInt8: + if (rng() % 2) + writeCString("UInt8", buf); + else + writeCString("Bool", buf); + return; + case TypeIndex::FixedString: + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + return; + case TypeIndex::DateTime64: + writeString("DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION + 1) + ")", buf); + return; + case TypeIndex::Decimal32: + writeString("Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION + 1) + ")", buf); + return; + case TypeIndex::Decimal64: + writeString("Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION + 1) + ")", buf); + return; + case TypeIndex::Decimal128: + writeString("Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION + 1) + ")", buf); + return; + case TypeIndex::Decimal256: + writeString("Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION + 1) + ")", buf); + return; + case TypeIndex::Enum8: + writeCString("Enum8(", buf); + writeEnumValues(column_name, rng, buf, INT8_MAX); + writeChar(')', buf); + return; + case TypeIndex::Enum16: + writeCString("Enum16(", buf); + writeEnumValues(column_name, rng, buf, INT16_MAX); + writeChar(')', buf); + return; + case TypeIndex::LowCardinality: + writeCString("LowCardinality(", buf); + writeLowCardinalityNestedType(rng, buf, allow_suspicious_lc_types); + writeChar(')', buf); + return; + case TypeIndex::Nullable: + { + writeCString("Nullable(", buf); + writeRandomType<false>(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); + writeChar(')', buf); + return; + } + case TypeIndex::Array: + { + writeCString("Array(", buf); + writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); + writeChar(')', buf); + return; + } + case TypeIndex::Map: + { + writeCString("Map(", buf); + writeMapKeyType(column_name, rng, buf); + writeCString(", ", buf); + writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); + writeChar(')', buf); + return; + } + case TypeIndex::Tuple: + { + size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; + bool generate_nested = rng() % 2; + bool generate_named_tuple = rng() % 2; + if (generate_nested) + writeCString("Nested(", buf); + else + writeCString("Tuple(", buf); + + for (size_t i = 0; i != elements; ++i) + { + if (i != 0) + writeCString(", ", buf); + + String element_name = "e" + std::to_string(i + 1); + if (generate_named_tuple || generate_nested) + { + writeString(element_name, buf); + writeChar(' ', buf); + } + writeRandomType(element_name, rng, buf, allow_suspicious_lc_types, depth + 1); + } + writeChar(')', buf); + return; + } + default: + writeString(magic_enum::enum_name<TypeIndex>(type), buf); + return; + } + } + + void writeRandomStructure(pcg64 & rng, size_t number_of_columns, WriteBuffer & buf, bool allow_suspicious_lc_types) + { + for (size_t i = 0; i != number_of_columns; ++i) + { + if (i != 0) + writeCString(", ", buf); + String column_name = "c" + std::to_string(i + 1); + writeString(column_name, buf); + writeChar(' ', buf); + writeRandomType(column_name, rng, buf, allow_suspicious_lc_types); + } + } +} + +DataTypePtr FunctionGenerateRandomStructure::getReturnTypeImpl(const DataTypes & arguments) const +{ + if (arguments.size() > 2) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 2", + getName(), arguments.size()); + + + for (size_t i = 0; i != arguments.size(); ++i) + { + if (!isUnsignedInteger(arguments[i]) && !arguments[i]->onlyNull()) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the {} argument of function {}, expected unsigned integer or Null", + arguments[i]->getName(), + i + 1, + getName()); + } + } + + return std::make_shared<DataTypeString>(); +} + +ColumnPtr FunctionGenerateRandomStructure::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const +{ + size_t seed = randomSeed(); + size_t number_of_columns = 0; + + if (!arguments.empty() && !arguments[0].column->onlyNull()) + { + number_of_columns = arguments[0].column->getUInt(0); + if (number_of_columns > MAX_NUMBER_OF_COLUMNS) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Maximum allowed number of columns is {}, got {}", + MAX_NUMBER_OF_COLUMNS, + number_of_columns); + } + + if (arguments.size() > 1 && !arguments[1].column->onlyNull()) + seed = arguments[1].column->getUInt(0); + + pcg64 rng(seed); + if (number_of_columns == 0) + number_of_columns = generateNumberOfColumns(rng); + + auto col_res = ColumnString::create(); + auto & string_column = assert_cast<ColumnString &>(*col_res); + auto & chars = string_column.getChars(); + WriteBufferFromVector buf(chars); + writeRandomStructure(rng, number_of_columns, buf, allow_suspicious_lc_types); + buf.finalize(); + chars.push_back(0); + string_column.getOffsets().push_back(chars.size()); + return ColumnConst::create(std::move(col_res), input_rows_count); +} + +String FunctionGenerateRandomStructure::generateRandomStructure(size_t seed, const ContextPtr & context) +{ + pcg64 rng(seed); + size_t number_of_columns = generateNumberOfColumns(rng); + WriteBufferFromOwnString buf; + writeRandomStructure(rng, number_of_columns, buf, context->getSettingsRef().allow_suspicious_low_cardinality_types); + return buf.str(); +} + +REGISTER_FUNCTION(GenerateRandomStructure) +{ + factory.registerFunction<FunctionGenerateRandomStructure>(FunctionDocumentation + { + .description=R"( +Generates a random table structure. +This function takes 2 optional constant arguments: +the number of columns in the result structure (random by default) and random seed (random by default) +The maximum number of columns is 128. +The function returns a value of type String. +)", + .examples{ + {"random", "SELECT generateRandomStructure()", "c1 UInt32, c2 FixedString(25)"}, + {"with specified number of columns", "SELECT generateRandomStructure(3)", "c1 String, c2 Array(Int32), c3 LowCardinality(String)"}, + {"with specified seed", "SELECT generateRandomStructure(1, 42)", "c1 UInt128"}, + }, + .categories{"Random"} + }, + FunctionFactory::CaseSensitive); +} + +} |
