diff options
| author | vitalyisaev <[email protected]> | 2023-11-14 09:58:56 +0300 |
|---|---|---|
| committer | vitalyisaev <[email protected]> | 2023-11-14 10:20:20 +0300 |
| commit | c2b2dfd9827a400a8495e172a56343462e3ceb82 (patch) | |
| tree | cd4e4f597d01bede4c82dffeb2d780d0a9046bd0 /contrib/clickhouse/src/Functions/array/arrayDistinct.cpp | |
| parent | d4ae8f119e67808cb0cf776ba6e0cf95296f2df7 (diff) | |
YQ Connector: move tests from yql to ydb (OSS)
Перенос папки с тестами на Коннектор из папки yql в папку ydb (синхронизируется с github).
Diffstat (limited to 'contrib/clickhouse/src/Functions/array/arrayDistinct.cpp')
| -rw-r--r-- | contrib/clickhouse/src/Functions/array/arrayDistinct.cpp | 295 |
1 files changed, 295 insertions, 0 deletions
diff --git a/contrib/clickhouse/src/Functions/array/arrayDistinct.cpp b/contrib/clickhouse/src/Functions/array/arrayDistinct.cpp new file mode 100644 index 00000000000..ea331d6bdad --- /dev/null +++ b/contrib/clickhouse/src/Functions/array/arrayDistinct.cpp @@ -0,0 +1,295 @@ +#include <Functions/IFunction.h> +#include <Functions/FunctionFactory.h> +#include <Functions/FunctionHelpers.h> +#include <DataTypes/DataTypeArray.h> +#include <DataTypes/DataTypeNullable.h> +#include <Columns/ColumnArray.h> +#include <Columns/ColumnNullable.h> +#include <Columns/ColumnString.h> +#include <Common/HashTable/ClearableHashSet.h> +#include <Common/SipHash.h> +#include <Common/assert_cast.h> + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + + +/// Find different elements in an array. +class FunctionArrayDistinct : public IFunction +{ +public: + static constexpr auto name = "arrayDistinct"; + + static FunctionPtr create(ContextPtr) + { + return std::make_shared<FunctionArrayDistinct>(); + } + + String getName() const override + { + return name; + } + + bool isVariadic() const override { return false; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + size_t getNumberOfArguments() const override { return 1; } + + bool useDefaultImplementationForConstants() const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[0].get()); + if (!array_type) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be array but it has type {}.", + getName(), arguments[0]->getName()); + + auto nested_type = removeNullable(array_type->getNestedType()); + + return std::make_shared<DataTypeArray>(nested_type); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override; + +private: + /// Initially allocate a piece of memory for 512 elements. NOTE: This is just a guess. + static constexpr size_t INITIAL_SIZE_DEGREE = 9; + + template <typename T> + static bool executeNumber( + const IColumn & src_data, + const ColumnArray::Offsets & src_offsets, + IColumn & res_data_col, + ColumnArray::Offsets & res_offsets, + const ColumnNullable * nullable_col); + + static bool executeString( + const IColumn & src_data, + const ColumnArray::Offsets & src_offsets, + IColumn & res_data_col, + ColumnArray::Offsets & res_offsets, + const ColumnNullable * nullable_col); + + static void executeHashed( + const IColumn & src_data, + const ColumnArray::Offsets & src_offsets, + IColumn & res_data_col, + ColumnArray::Offsets & res_offsets, + const ColumnNullable * nullable_col); +}; + + +ColumnPtr FunctionArrayDistinct::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const +{ + ColumnPtr array_ptr = arguments[0].column; + const ColumnArray * array = checkAndGetColumn<ColumnArray>(array_ptr.get()); + + const auto & return_type = result_type; + + auto res_ptr = return_type->createColumn(); + ColumnArray & res = assert_cast<ColumnArray &>(*res_ptr); + + const IColumn & src_data = array->getData(); + const ColumnArray::Offsets & offsets = array->getOffsets(); + + IColumn & res_data = res.getData(); + ColumnArray::Offsets & res_offsets = res.getOffsets(); + + const ColumnNullable * nullable_col = checkAndGetColumn<ColumnNullable>(src_data); + + const IColumn * inner_col; + + if (nullable_col) + { + inner_col = &nullable_col->getNestedColumn(); + } + else + { + inner_col = &src_data; + } + + if (!(executeNumber<UInt8>(*inner_col, offsets, res_data, res_offsets, nullable_col) + || executeNumber<UInt16>(*inner_col, offsets, res_data, res_offsets, nullable_col) + || executeNumber<UInt32>(*inner_col, offsets, res_data, res_offsets, nullable_col) + || executeNumber<UInt64>(*inner_col, offsets, res_data, res_offsets, nullable_col) + || executeNumber<Int8>(*inner_col, offsets, res_data, res_offsets, nullable_col) + || executeNumber<Int16>(*inner_col, offsets, res_data, res_offsets, nullable_col) + || executeNumber<Int32>(*inner_col, offsets, res_data, res_offsets, nullable_col) + || executeNumber<Int64>(*inner_col, offsets, res_data, res_offsets, nullable_col) + || executeNumber<Float32>(*inner_col, offsets, res_data, res_offsets, nullable_col) + || executeNumber<Float64>(*inner_col, offsets, res_data, res_offsets, nullable_col) + || executeString(*inner_col, offsets, res_data, res_offsets, nullable_col))) + executeHashed(*inner_col, offsets, res_data, res_offsets, nullable_col); + + return res_ptr; +} + +template <typename T> +bool FunctionArrayDistinct::executeNumber( + const IColumn & src_data, + const ColumnArray::Offsets & src_offsets, + IColumn & res_data_col, + ColumnArray::Offsets & res_offsets, + const ColumnNullable * nullable_col) +{ + const ColumnVector<T> * src_data_concrete = checkAndGetColumn<ColumnVector<T>>(&src_data); + + if (!src_data_concrete) + { + return false; + } + + const PaddedPODArray<T> & values = src_data_concrete->getData(); + PaddedPODArray<T> & res_data = typeid_cast<ColumnVector<T> &>(res_data_col).getData(); + + const PaddedPODArray<UInt8> * src_null_map = nullptr; + + if (nullable_col) + src_null_map = &nullable_col->getNullMapData(); + + using Set = ClearableHashSetWithStackMemory<T, DefaultHash<T>, + INITIAL_SIZE_DEGREE>; + + Set set; + + ColumnArray::Offset prev_src_offset = 0; + ColumnArray::Offset res_offset = 0; + + for (auto curr_src_offset : src_offsets) + { + set.clear(); + + for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) + { + if (nullable_col && (*src_null_map)[j]) + continue; + + if (!set.find(values[j])) + { + res_data.emplace_back(values[j]); + set.insert(values[j]); + } + } + + res_offset += set.size(); + res_offsets.emplace_back(res_offset); + + prev_src_offset = curr_src_offset; + } + return true; +} + +bool FunctionArrayDistinct::executeString( + const IColumn & src_data, + const ColumnArray::Offsets & src_offsets, + IColumn & res_data_col, + ColumnArray::Offsets & res_offsets, + const ColumnNullable * nullable_col) +{ + const ColumnString * src_data_concrete = checkAndGetColumn<ColumnString>(&src_data); + + if (!src_data_concrete) + return false; + + ColumnString & res_data_column_string = typeid_cast<ColumnString &>(res_data_col); + + using Set = ClearableHashSetWithStackMemory<StringRef, StringRefHash, + INITIAL_SIZE_DEGREE>; + + const PaddedPODArray<UInt8> * src_null_map = nullptr; + + if (nullable_col) + src_null_map = &nullable_col->getNullMapData(); + + Set set; + + ColumnArray::Offset prev_src_offset = 0; + ColumnArray::Offset res_offset = 0; + + for (auto curr_src_offset : src_offsets) + { + set.clear(); + + for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) + { + if (nullable_col && (*src_null_map)[j]) + continue; + + StringRef str_ref = src_data_concrete->getDataAt(j); + + if (!set.find(str_ref)) + { + set.insert(str_ref); + res_data_column_string.insertData(str_ref.data, str_ref.size); + } + } + + res_offset += set.size(); + res_offsets.emplace_back(res_offset); + + prev_src_offset = curr_src_offset; + } + return true; +} + +void FunctionArrayDistinct::executeHashed( + const IColumn & src_data, + const ColumnArray::Offsets & src_offsets, + IColumn & res_data_col, + ColumnArray::Offsets & res_offsets, + const ColumnNullable * nullable_col) +{ + using Set = ClearableHashSetWithStackMemory<UInt128, UInt128TrivialHash, + INITIAL_SIZE_DEGREE>; + + const PaddedPODArray<UInt8> * src_null_map = nullptr; + + if (nullable_col) + src_null_map = &nullable_col->getNullMapData(); + + Set set; + + ColumnArray::Offset prev_src_offset = 0; + ColumnArray::Offset res_offset = 0; + + for (auto curr_src_offset : src_offsets) + { + set.clear(); + + for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) + { + if (nullable_col && (*src_null_map)[j]) + continue; + + SipHash hash_function; + src_data.updateHashWithValue(j, hash_function); + const auto hash = hash_function.get128(); + + if (!set.find(hash)) + { + set.insert(hash); + res_data_col.insertFrom(src_data, j); + } + } + + res_offset += set.size(); + res_offsets.emplace_back(res_offset); + + prev_src_offset = curr_src_offset; + } +} + + +REGISTER_FUNCTION(ArrayDistinct) +{ + factory.registerFunction<FunctionArrayDistinct>(); +} + +} |
