summary refs log tree commit diff stats
path: root/contrib/clickhouse/src/Functions/FunctionGenerateRandomStructure.cpp
diff options
context:
space:
mode:
author vitalyisaev <[email protected]> 2023-11-14 09:58:56 +0300
committer vitalyisaev <[email protected]> 2023-11-14 10:20:20 +0300
commit c2b2dfd9827a400a8495e172a56343462e3ceb82 (patch)
tree cd4e4f597d01bede4c82dffeb2d780d0a9046bd0 /contrib/clickhouse/src/Functions/FunctionGenerateRandomStructure.cpp
parent d4ae8f119e67808cb0cf776ba6e0cf95296f2df7 (diff)
YQ Connector: move tests from yql to ydb (OSS)
Перенос папки с тестами на Коннектор из папки yql в папку ydb (синхронизируется с github).
Diffstat (limited to 'contrib/clickhouse/src/Functions/FunctionGenerateRandomStructure.cpp')
-rw-r--r-- contrib/clickhouse/src/Functions/FunctionGenerateRandomStructure.cpp 446
1 file changed, 446 insertions, 0 deletions
diff --git a/contrib/clickhouse/src/Functions/FunctionGenerateRandomStructure.cpp b/contrib/clickhouse/src/Functions/FunctionGenerateRandomStructure.cpp
new file mode 100644
index 00000000000..f85b2596530
--- /dev/null
+++ b/contrib/clickhouse/src/Functions/FunctionGenerateRandomStructure.cpp
@@ -0,0 +1,446 @@
+#include <Functions/FunctionGenerateRandomStructure.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionHelpers.h>
+#include <Functions/IFunction.h>
+#include <Columns/ColumnString.h>
+#include <DataTypes/DataTypeString.h>
+#include <DataTypes/DataTypeFixedString.h>
+#include <Interpreters/Context.h>
+#include <Common/randomSeed.h>
+#include <Common/FunctionDocumentation.h>
+#include <IO/WriteHelpers.h>
+#include <IO/WriteBufferFromVector.h>
+
+#include <pcg_random.hpp>
+
+namespace DB
+{
+
+/// Error codes used by this file. They are only declared here (`extern`);
+/// their definitions are not part of this translation unit.
+namespace ErrorCodes
+{
+ /// Thrown by getReturnTypeImpl when more than two arguments are passed.
+ extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+ /// Thrown by getReturnTypeImpl when an argument is neither an unsigned integer nor Null.
+ extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+ /// Thrown by executeImpl when the requested number of columns exceeds MAX_NUMBER_OF_COLUMNS.
+ extern const int BAD_ARGUMENTS;
+}
+
+namespace
+{
+ /// Largest number of columns a generated structure may contain.
+ constexpr size_t MAX_NUMBER_OF_COLUMNS = 128;
+ /// Largest number of elements generated for a Tuple / Nested type.
+ constexpr size_t MAX_TUPLE_ELEMENTS = 16;
+ /// Upper bound of the parameter written into a generated `DateTime64(...)`.
+ constexpr size_t MAX_DATETIME64_PRECISION = 9;
+ /// Upper bounds of the parameter written into a generated `DecimalN(...)`.
+ constexpr size_t MAX_DECIMAL32_PRECISION = 9;
+ constexpr size_t MAX_DECIMAL64_PRECISION = 18;
+ constexpr size_t MAX_DECIMAL128_PRECISION = 38;
+ constexpr size_t MAX_DECIMAL256_PRECISION = 76;
+ /// Nesting depth after which writeRandomType is expected to stop producing complex types.
+ constexpr size_t MAX_DEPTH = 16;
+
+ /// Type indices that getAllTypes() always includes in its result.
+ /// writeRandomType() picks an element by `rng() % size`, so the order and
+ /// number of entries directly determine the output for a given seed.
+ constexpr std::array<TypeIndex, 29> simple_types
+ {
+ TypeIndex::Int8,
+ TypeIndex::UInt8,
+ TypeIndex::Int16,
+ TypeIndex::UInt16,
+ TypeIndex::Int32,
+ TypeIndex::UInt32,
+ TypeIndex::Int64,
+ TypeIndex::UInt64,
+ TypeIndex::Int128,
+ TypeIndex::UInt128,
+ TypeIndex::Int256,
+ TypeIndex::UInt256,
+ TypeIndex::Float32,
+ TypeIndex::Float64,
+ TypeIndex::Decimal32,
+ TypeIndex::Decimal64,
+ TypeIndex::Decimal128,
+ TypeIndex::Decimal256,
+ TypeIndex::Date,
+ TypeIndex::Date32,
+ TypeIndex::DateTime,
+ TypeIndex::DateTime64,
+ TypeIndex::String,
+ TypeIndex::FixedString,
+ TypeIndex::Enum8,
+ TypeIndex::Enum16,
+ TypeIndex::IPv4,
+ TypeIndex::IPv6,
+ TypeIndex::UUID,
+ };
+
+ /// Wrapper / container type indices. getAllTypes() appends them after
+ /// simple_types only when its `allow_complex_types` template argument is true.
+ constexpr std::array<TypeIndex, 5> complex_types
+ {
+ TypeIndex::Nullable,
+ TypeIndex::LowCardinality,
+ TypeIndex::Array,
+ TypeIndex::Tuple,
+ TypeIndex::Map,
+ };
+
+ /// Candidate type indices for the key of a generated Map; writeMapKeyType()
+ /// picks one by `rng() % size`. The LowCardinality entry is expanded there to
+ /// LowCardinality(String) or LowCardinality(FixedString(N)).
+ constexpr std::array<TypeIndex, 22> map_key_types
+ {
+ TypeIndex::Int8,
+ TypeIndex::UInt8,
+ TypeIndex::Int16,
+ TypeIndex::UInt16,
+ TypeIndex::Int32,
+ TypeIndex::UInt32,
+ TypeIndex::Int64,
+ TypeIndex::UInt64,
+ TypeIndex::Int128,
+ TypeIndex::UInt128,
+ TypeIndex::Int256,
+ TypeIndex::UInt256,
+ TypeIndex::Date,
+ TypeIndex::Date32,
+ TypeIndex::DateTime,
+ TypeIndex::String,
+ TypeIndex::FixedString,
+ TypeIndex::IPv4,
+ TypeIndex::Enum8,
+ TypeIndex::Enum16,
+ TypeIndex::UUID,
+ TypeIndex::LowCardinality,
+ };
+
+ /// Candidate nested types for LowCardinality(...) that writeLowCardinalityNestedType()
+ /// draws from when `allow_suspicious_lc_types` is true. When the flag is false that
+ /// function restricts itself to String and FixedString instead of using this table.
+ constexpr std::array<TypeIndex, 22> suspicious_lc_types
+ {
+ TypeIndex::Int8,
+ TypeIndex::UInt8,
+ TypeIndex::Int16,
+ TypeIndex::UInt16,
+ TypeIndex::Int32,
+ TypeIndex::UInt32,
+ TypeIndex::Int64,
+ TypeIndex::UInt64,
+ TypeIndex::Int128,
+ TypeIndex::UInt128,
+ TypeIndex::Int256,
+ TypeIndex::UInt256,
+ TypeIndex::Float32,
+ TypeIndex::Float64,
+ TypeIndex::Date,
+ TypeIndex::Date32,
+ TypeIndex::DateTime,
+ TypeIndex::String,
+ TypeIndex::FixedString,
+ TypeIndex::IPv4,
+ TypeIndex::IPv6,
+ TypeIndex::UUID,
+ };
+
+ /// Builds, at compile time, the list of type indices writeRandomType() may choose from:
+ /// all of simple_types, followed by complex_types when `allow_complex_types` is true.
+ template <bool allow_complex_types>
+ constexpr auto getAllTypes()
+ {
+     constexpr size_t simple_count = simple_types.size();
+     constexpr size_t complex_count = allow_complex_types ? complex_types.size() : 0;
+
+     std::array<TypeIndex, simple_count + complex_count> all_types{};
+     for (size_t pos = 0; pos < all_types.size(); ++pos)
+         all_types[pos] = pos < simple_count ? simple_types[pos] : complex_types[pos - simple_count];
+
+     return all_types;
+ }
+
+ /// Draws a column count in the range [1, MAX_NUMBER_OF_COLUMNS] from `rng`.
+ size_t generateNumberOfColumns(pcg64 & rng)
+ {
+     const size_t zero_based = rng() % MAX_NUMBER_OF_COLUMNS;
+     return zero_based + 1;
+ }
+
+ /// Writes the type that goes inside `LowCardinality(...)`, optionally wrapped in `Nullable(...)`.
+ /// With `allow_suspicious_lc_types` the nested type is drawn from suspicious_lc_types,
+ /// otherwise only String or FixedString(N) is produced.
+ void writeLowCardinalityNestedType(pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types)
+ {
+     const auto write_fixed_string = [&rng, &buf]
+     {
+         writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf);
+     };
+
+     const bool wrap_in_nullable = rng() % 2;
+     if (wrap_in_nullable)
+         writeCString("Nullable(", buf);
+
+     if (!allow_suspicious_lc_types)
+     {
+         /// Only String and FixedString are generated in this mode.
+         if (rng() % 2)
+             writeCString("String", buf);
+         else
+             write_fixed_string();
+     }
+     else
+     {
+         const TypeIndex nested = suspicious_lc_types[rng() % suspicious_lc_types.size()];
+         if (nested != TypeIndex::FixedString)
+             writeString(magic_enum::enum_name<TypeIndex>(nested), buf);
+         else
+             write_fixed_string();
+     }
+
+     if (wrap_in_nullable)
+         writeChar(')', buf);
+ }
+
+ /// Writes a comma-separated list of enum entries `'<column_name>V<i>' = <value>` into `buf`.
+ /// `max_value` is the largest value of the underlying integer type (INT8_MAX or INT16_MAX at the call sites).
+ void writeEnumValues(const String & column_name, pcg64 & rng, WriteBuffer & buf, ssize_t max_value)
+ {
+     /// Keep enums small (1..16 entries): big enums would make the result huge,
+     /// slow this function down and may trigger `Max query size exceeded`
+     /// when the result is used with generateRandom.
+     const size_t num_values = rng() % 16 + 1;
+     std::vector<Int16> values(num_values);
+
+     /// Draw every value from [-(max_value + 1), max_value - num_values + 1].
+     for (size_t i = 0; i < num_values; ++i)
+         values[i] = rng() % (2 * max_value + 3 - num_values) - max_value - 1;
+
+     /// Sorting and then adding the position to each element makes the sequence
+     /// strictly increasing, i.e. all values become distinct and still fit the range.
+     std::sort(values.begin(), values.end());
+     for (size_t offset = 0; offset != values.size(); ++offset)
+         values[offset] += offset;
+
+     std::shuffle(values.begin(), values.end(), rng);
+
+     size_t index = 0;
+     for (const Int16 value : values)
+     {
+         if (index != 0)
+             writeCString(", ", buf);
+         writeString("'" + column_name + "V" + std::to_string(index) + "' = " + std::to_string(value), buf);
+         ++index;
+     }
+ }
+
+ /// Writes a randomly chosen Map key type (one of map_key_types) into `buf`.
+ /// `column_name` is only used to name the entries of a generated Enum8/Enum16.
+ void writeMapKeyType(const String & column_name, pcg64 & rng, WriteBuffer & buf)
+ {
+     const auto write_fixed_string = [&rng, &buf]
+     {
+         writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf);
+     };
+
+     const TypeIndex key_type = map_key_types[rng() % map_key_types.size()];
+
+     if (key_type == TypeIndex::FixedString)
+     {
+         write_fixed_string();
+     }
+     else if (key_type == TypeIndex::LowCardinality)
+     {
+         /// Inside LowCardinality a Map key supports only String and FixedString.
+         writeCString("LowCardinality(", buf);
+         if (rng() % 2)
+             writeCString("String", buf);
+         else
+             write_fixed_string();
+         writeChar(')', buf);
+     }
+     else if (key_type == TypeIndex::Enum8)
+     {
+         writeCString("Enum8(", buf);
+         writeEnumValues(column_name, rng, buf, INT8_MAX);
+         writeChar(')', buf);
+     }
+     else if (key_type == TypeIndex::Enum16)
+     {
+         writeCString("Enum16(", buf);
+         writeEnumValues(column_name, rng, buf, INT16_MAX);
+         writeChar(')', buf);
+     }
+     else
+     {
+         /// Every remaining key type is written by its enum name.
+         writeString(magic_enum::enum_name<TypeIndex>(key_type), buf);
+     }
+ }
+
+ /// Writes one randomly chosen data type into `buf`.
+ ///
+ /// @tparam allow_complex_types  when false, only simple_types are candidates; when true,
+ ///                              complex_types (Nullable, LowCardinality, Array, Tuple, Map) are too.
+ /// @param column_name           name of the column / tuple element; used to name generated enum entries.
+ /// @param rng                   random generator; every decision consumes values from it.
+ /// @param buf                   output buffer the textual type is appended to.
+ /// @param allow_suspicious_lc_types  forwarded to writeLowCardinalityNestedType.
+ /// @param depth                 current nesting level; incremented for every nested type.
+ template <bool allow_complex_types = true>
+ void writeRandomType(const String & column_name, pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types, size_t depth = 0)
+ {
+     /// Beyond MAX_DEPTH only simple types are generated so that recursion terminates.
+     /// The flag must be forwarded as-is (previously `depth` was passed in its place),
+     /// and we must return here: otherwise a second type would be appended right
+     /// after the simple one and the nesting would keep growing.
+     if (allow_complex_types && depth > MAX_DEPTH)
+     {
+         writeRandomType<false>(column_name, rng, buf, allow_suspicious_lc_types, depth);
+         return;
+     }
+
+     constexpr auto all_types = getAllTypes<allow_complex_types>();
+     auto type = all_types[rng() % all_types.size()];
+
+     switch (type)
+     {
+         case TypeIndex::UInt8:
+             /// UInt8 is randomly written either as UInt8 or as Bool.
+             if (rng() % 2)
+                 writeCString("UInt8", buf);
+             else
+                 writeCString("Bool", buf);
+             return;
+         case TypeIndex::FixedString:
+             writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf);
+             return;
+         case TypeIndex::DateTime64:
+             writeString("DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION + 1) + ")", buf);
+             return;
+         case TypeIndex::Decimal32:
+             writeString("Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION + 1) + ")", buf);
+             return;
+         case TypeIndex::Decimal64:
+             writeString("Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION + 1) + ")", buf);
+             return;
+         case TypeIndex::Decimal128:
+             writeString("Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION + 1) + ")", buf);
+             return;
+         case TypeIndex::Decimal256:
+             writeString("Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION + 1) + ")", buf);
+             return;
+         case TypeIndex::Enum8:
+             writeCString("Enum8(", buf);
+             writeEnumValues(column_name, rng, buf, INT8_MAX);
+             writeChar(')', buf);
+             return;
+         case TypeIndex::Enum16:
+             writeCString("Enum16(", buf);
+             writeEnumValues(column_name, rng, buf, INT16_MAX);
+             writeChar(')', buf);
+             return;
+         case TypeIndex::LowCardinality:
+             writeCString("LowCardinality(", buf);
+             writeLowCardinalityNestedType(rng, buf, allow_suspicious_lc_types);
+             writeChar(')', buf);
+             return;
+         case TypeIndex::Nullable:
+         {
+             /// Only simple types are generated inside Nullable.
+             writeCString("Nullable(", buf);
+             writeRandomType<false>(column_name, rng, buf, allow_suspicious_lc_types, depth + 1);
+             writeChar(')', buf);
+             return;
+         }
+         case TypeIndex::Array:
+         {
+             writeCString("Array(", buf);
+             writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1);
+             writeChar(')', buf);
+             return;
+         }
+         case TypeIndex::Map:
+         {
+             writeCString("Map(", buf);
+             writeMapKeyType(column_name, rng, buf);
+             writeCString(", ", buf);
+             writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1);
+             writeChar(')', buf);
+             return;
+         }
+         case TypeIndex::Tuple:
+         {
+             /// The Tuple slot produces either Tuple(...) or Nested(...);
+             /// Nested elements are always named, Tuple elements only sometimes.
+             size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1;
+             bool generate_nested = rng() % 2;
+             bool generate_named_tuple = rng() % 2;
+             if (generate_nested)
+                 writeCString("Nested(", buf);
+             else
+                 writeCString("Tuple(", buf);
+
+             for (size_t i = 0; i != elements; ++i)
+             {
+                 if (i != 0)
+                     writeCString(", ", buf);
+
+                 String element_name = "e" + std::to_string(i + 1);
+                 if (generate_named_tuple || generate_nested)
+                 {
+                     writeString(element_name, buf);
+                     writeChar(' ', buf);
+                 }
+                 writeRandomType(element_name, rng, buf, allow_suspicious_lc_types, depth + 1);
+             }
+             writeChar(')', buf);
+             return;
+         }
+         default:
+             /// All remaining types take no parameters and are written by their enum name.
+             writeString(magic_enum::enum_name<TypeIndex>(type), buf);
+             return;
+     }
+ }
+
+ /// Writes `number_of_columns` comma-separated column definitions `c<N> <random type>`
+ /// (N starting from 1) into `buf`.
+ void writeRandomStructure(pcg64 & rng, size_t number_of_columns, WriteBuffer & buf, bool allow_suspicious_lc_types)
+ {
+     size_t written = 0;
+     while (written != number_of_columns)
+     {
+         if (written != 0)
+             writeCString(", ", buf);
+
+         ++written;
+         const String column_name = "c" + std::to_string(written);
+         writeString(column_name, buf);
+         writeChar(' ', buf);
+         writeRandomType(column_name, rng, buf, allow_suspicious_lc_types);
+     }
+ }
+}
+
+/// Validates the argument types (at most two, each an unsigned integer or Null)
+/// and reports String as the result type.
+DataTypePtr FunctionGenerateRandomStructure::getReturnTypeImpl(const DataTypes & arguments) const
+{
+    const size_t num_arguments = arguments.size();
+    if (num_arguments > 2)
+        throw Exception(
+            ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
+            "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 2",
+            getName(), num_arguments);
+
+    size_t position = 0;
+    for (const auto & argument : arguments)
+    {
+        ++position;
+        if (isUnsignedInteger(argument) || argument->onlyNull())
+            continue;
+
+        throw Exception(
+            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+            "Illegal type {} of the {} argument of function {}, expected unsigned integer or Null",
+            argument->getName(),
+            position,
+            getName());
+    }
+
+    return std::make_shared<DataTypeString>();
+}
+
+/// Produces a constant String column holding one random structure.
+/// Argument 0 (optional, may be Null): number of columns; 0 or absent means "choose randomly".
+/// Argument 1 (optional, may be Null): seed; absent means a fresh random seed.
+ColumnPtr FunctionGenerateRandomStructure::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const
+{
+    /// True when the argument at `position` was passed and is not Null.
+    const auto is_provided = [&arguments](size_t position)
+    {
+        return arguments.size() > position && !arguments[position].column->onlyNull();
+    };
+
+    size_t seed = randomSeed();
+    size_t number_of_columns = 0;
+
+    if (is_provided(0))
+    {
+        number_of_columns = arguments[0].column->getUInt(0);
+        if (number_of_columns > MAX_NUMBER_OF_COLUMNS)
+            throw Exception(
+                ErrorCodes::BAD_ARGUMENTS,
+                "Maximum allowed number of columns is {}, got {}",
+                MAX_NUMBER_OF_COLUMNS,
+                number_of_columns);
+    }
+
+    if (is_provided(1))
+        seed = arguments[1].column->getUInt(0);
+
+    pcg64 rng(seed);
+    if (number_of_columns == 0)
+        number_of_columns = generateNumberOfColumns(rng);
+
+    auto result_column = ColumnString::create();
+    auto & chars = result_column->getChars();
+    WriteBufferFromVector buf(chars);
+    writeRandomStructure(rng, number_of_columns, buf, allow_suspicious_lc_types);
+    buf.finalize();
+
+    /// Terminate the single string with a zero byte and record its end offset.
+    chars.push_back(0);
+    result_column->getOffsets().push_back(chars.size());
+    return ColumnConst::create(std::move(result_column), input_rows_count);
+}
+
+/// Returns a random structure for the given `seed`; the number of columns is drawn
+/// from the same generator. The `allow_suspicious_low_cardinality_types` setting is
+/// taken from `context`.
+String FunctionGenerateRandomStructure::generateRandomStructure(size_t seed, const ContextPtr & context)
+{
+    pcg64 rng(seed);
+    const size_t number_of_columns = generateNumberOfColumns(rng);
+
+    WriteBufferFromOwnString out;
+    const bool allow_suspicious_lc = context->getSettingsRef().allow_suspicious_low_cardinality_types;
+    writeRandomStructure(rng, number_of_columns, out, allow_suspicious_lc);
+    return out.str();
+}
+
+/// Registers FunctionGenerateRandomStructure in the function factory under a
+/// case-sensitive name, together with its user-facing documentation
+/// (description, usage examples and category).
+REGISTER_FUNCTION(GenerateRandomStructure)
+{
+ factory.registerFunction<FunctionGenerateRandomStructure>(FunctionDocumentation
+ {
+ .description=R"(
+Generates a random table structure.
+This function takes 2 optional constant arguments:
+the number of columns in the result structure (random by default) and random seed (random by default)
+The maximum number of columns is 128.
+The function returns a value of type String.
+)",
+ .examples{
+ {"random", "SELECT generateRandomStructure()", "c1 UInt32, c2 FixedString(25)"},
+ {"with specified number of columns", "SELECT generateRandomStructure(3)", "c1 String, c2 Array(Int32), c3 LowCardinality(String)"},
+ {"with specified seed", "SELECT generateRandomStructure(1, 42)", "c1 UInt128"},
+ },
+ .categories{"Random"}
+ },
+ FunctionFactory::CaseSensitive);
+}
+
+}