diff options
| author | vitalyisaev <[email protected]> | 2023-11-14 09:58:56 +0300 |
|---|---|---|
| committer | vitalyisaev <[email protected]> | 2023-11-14 10:20:20 +0300 |
| commit | c2b2dfd9827a400a8495e172a56343462e3ceb82 (patch) | |
| tree | cd4e4f597d01bede4c82dffeb2d780d0a9046bd0 /contrib/clickhouse/src/Functions/regexpExtract.cpp | |
| parent | d4ae8f119e67808cb0cf776ba6e0cf95296f2df7 (diff) | |
YQ Connector: move tests from yql to ydb (OSS)
Перенос папки с тестами на Коннектор из папки yql в папку ydb (синхронизируется с github).
Diffstat (limited to 'contrib/clickhouse/src/Functions/regexpExtract.cpp')
| -rw-r--r-- | contrib/clickhouse/src/Functions/regexpExtract.cpp | 253 |
1 files changed, 253 insertions, 0 deletions
diff --git a/contrib/clickhouse/src/Functions/regexpExtract.cpp b/contrib/clickhouse/src/Functions/regexpExtract.cpp new file mode 100644 index 00000000000..2b3f0b2088d --- /dev/null +++ b/contrib/clickhouse/src/Functions/regexpExtract.cpp @@ -0,0 +1,253 @@ +#include <Columns/ColumnConst.h> +#include <Columns/ColumnString.h> +#include <DataTypes/DataTypeString.h> +#include <DataTypes/DataTypesNumber.h> +#include <Functions/FunctionFactory.h> +#include <Functions/FunctionHelpers.h> +#include <Functions/IFunction.h> +#include <Functions/Regexps.h> +#include <Interpreters/Context.h> +#include <base/StringRef.h> +#include <Common/FunctionDocumentation.h> + +namespace DB +{ +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_COLUMN; + extern const int INDEX_OF_POSITIONAL_ARGUMENT_IS_OUT_OF_RANGE; +} + +namespace +{ +class FunctionRegexpExtract : public IFunction +{ +public: + static constexpr auto name = "regexpExtract"; + static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionRegexpExtract>(); } + + String getName() const override { return name; } + + bool isVariadic() const override { return true; } + size_t getNumberOfArguments() const override { return 0; } + + bool useDefaultImplementationForConstants() const override { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (arguments.size() != 2 && arguments.size() != 3) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}", + getName(), + arguments.size()); + + FunctionArgumentDescriptors args{ + {"haystack", &isString<IDataType>, nullptr, "String"}, + {"pattern", &isString<IDataType>, isColumnConst, "const String"}, + }; + + if (arguments.size() == 3) + args.emplace_back(FunctionArgumentDescriptor{"index", &isInteger<IDataType>, nullptr, "Integer"}); + + validateFunctionArgumentTypes(*this, arguments, args); + + return std::make_shared<DataTypeString>(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override + { + const ColumnPtr column = arguments[0].column; + const ColumnPtr column_pattern = arguments[1].column; + const ColumnPtr column_index = arguments.size() > 2 ? arguments[2].column : nullptr; + + /// Check if the second argument is const column + const ColumnConst * col_pattern = typeid_cast<const ColumnConst *>(column_pattern.get()); + if (!col_pattern) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument of function {} must be constant string", getName()); + + /// Check if the first argument is string column(const or not) + const ColumnConst * col_const = typeid_cast<const ColumnConst *>(column.get()); + const ColumnString * col = nullptr; + if (col_const) + col = typeid_cast<const ColumnString *>(&col_const->getDataColumn()); + else + col = typeid_cast<const ColumnString *>(column.get()); + if (!col) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", arguments[0].column->getName(), getName()); + + auto col_res = ColumnString::create(); + ColumnString::Chars & vec_res = col_res->getChars(); + ColumnString::Offsets & offsets_res = col_res->getOffsets(); + + if (col_const) + constantVector(col_const->getValue<String>(), col_pattern->getValue<String>(), column_index, vec_res, offsets_res); + else if (!column_index || isColumnConst(*column_index)) + { + const auto * col_const_index = typeid_cast<const ColumnConst *>(column_index.get()); + ssize_t index = !col_const_index ? 1 : col_const_index->getInt(0); + vectorConstant(col->getChars(), col->getOffsets(), col_pattern->getValue<String>(), index, vec_res, offsets_res); + } + else + vectorVector(col->getChars(), col->getOffsets(), col_pattern->getValue<String>(), column_index, vec_res, offsets_res); + + return col_res; + } + +private: + static void saveMatch( + const OptimizedRegularExpression::MatchVec & matches, + size_t match_index, + const ColumnString::Chars & data, + size_t data_offset, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets, + size_t & res_offset) + { + if (match_index < matches.size() && matches[match_index].offset != std::string::npos) + { + const auto & match = matches[match_index]; + res_data.resize(res_offset + match.length + 1); + memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], &data[data_offset + match.offset], match.length); + res_offset += match.length; + } + else + res_data.resize(res_offset + 1); + + res_data[res_offset] = 0; + ++res_offset; + res_offsets.push_back(res_offset); + } + + static void vectorConstant( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + const std::string & pattern, + ssize_t index, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + const Regexps::Regexp regexp = Regexps::createRegexp<false, false, false>(pattern); + unsigned capture = regexp.getNumberOfSubpatterns(); + if (index < 0 || index >= capture + 1) + throw Exception( + ErrorCodes::INDEX_OF_POSITIONAL_ARGUMENT_IS_OUT_OF_RANGE, + "Index value {} is out of range, should be in [0, {})", + index, + capture + 1); + + OptimizedRegularExpression::MatchVec matches; + matches.reserve(index + 1); + + res_data.reserve(data.size() / 5); + res_offsets.reserve(offsets.size()); + size_t prev_offset = 0; + size_t res_offset = 0; + for (size_t cur_offset : offsets) + { + regexp.match( + reinterpret_cast<const char *>(&data[prev_offset]), + cur_offset - prev_offset - 1, + matches, + static_cast<unsigned>(index + 1)); + + saveMatch(matches, index, data, prev_offset, res_data, res_offsets, res_offset); + prev_offset = cur_offset; + } + } + + static void vectorVector( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + const std::string & pattern, + const ColumnPtr & column_index, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + res_data.reserve(data.size() / 5); + res_offsets.reserve(offsets.size()); + + const Regexps::Regexp regexp = Regexps::createRegexp<false, false, false>(pattern); + unsigned capture = regexp.getNumberOfSubpatterns(); + + OptimizedRegularExpression::MatchVec matches; + matches.reserve(capture + 1); + size_t prev_offset = 0; + size_t res_offset = 0; + for (size_t i = 0; i < offsets.size(); ++i) + { + size_t cur_offset = offsets[i]; + + ssize_t index = column_index->getInt(i); + if (index < 0 || index >= capture + 1) + throw Exception( + ErrorCodes::INDEX_OF_POSITIONAL_ARGUMENT_IS_OUT_OF_RANGE, + "Index value {} is out of range, should be in [0, {})", + index, + capture + 1); + + regexp.match( + reinterpret_cast<const char *>(&data[prev_offset]), + cur_offset - prev_offset - 1, + matches, + static_cast<unsigned>(index + 1)); + + saveMatch(matches, index, data, prev_offset, res_data, res_offsets, res_offset); + prev_offset = cur_offset; + } + } + + static void constantVector( + const std::string & str, + const std::string & pattern, + const ColumnPtr & column_index, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + size_t rows = column_index->size(); + res_data.reserve(str.size() / 5); + res_offsets.reserve(rows); + + /// Copy data into padded array to be able to use memcpySmallAllowReadWriteOverflow15. + ColumnString::Chars padded_str; + padded_str.insert(str.begin(), str.end()); + + const Regexps::Regexp regexp = Regexps::createRegexp<false, false, false>(pattern); + unsigned capture = regexp.getNumberOfSubpatterns(); + OptimizedRegularExpression::MatchVec matches; + matches.reserve(capture + 1); + regexp.match(reinterpret_cast<const char *>(padded_str.data()), padded_str.size(), matches, static_cast<unsigned>(capture + 1)); + + size_t res_offset = 0; + for (size_t i = 0; i < rows; ++i) + { + ssize_t index = column_index->getInt(i); + if (index < 0 || index >= capture + 1) + throw Exception( + ErrorCodes::INDEX_OF_POSITIONAL_ARGUMENT_IS_OUT_OF_RANGE, + "Index value {} is out of range, should be in [0, {})", + index, + capture + 1); + + saveMatch(matches, index, padded_str, 0, res_data, res_offsets, res_offset); + } + } +}; + +} + +REGISTER_FUNCTION(RegexpExtract) +{ + factory.registerFunction<FunctionRegexpExtract>( + FunctionDocumentation{.description="Extracts the first string in haystack that matches the regexp pattern and corresponds to the regex group index."}); + + /// For Spark compatibility. + factory.registerAlias("REGEXP_EXTRACT", "regexpExtract", FunctionFactory::CaseInsensitive); +} + +} |
