| field | value | date |
|---|---|---|
| author | vitalyisaev <[email protected]> | 2023-11-14 09:58:56 +0300 |
| committer | vitalyisaev <[email protected]> | 2023-11-14 10:20:20 +0300 |
| commit | c2b2dfd9827a400a8495e172a56343462e3ceb82 (patch) | |
| tree | cd4e4f597d01bede4c82dffeb2d780d0a9046bd0 /contrib/clickhouse/src/Functions/FunctionsCharsetClassification.cpp | |
| parent | d4ae8f119e67808cb0cf776ba6e0cf95296f2df7 (diff) | |
YQ Connector: move tests from yql to ydb (OSS)
Move the Connector test folder from the yql folder to the ydb folder (kept in sync with github).
Diffstat (limited to 'contrib/clickhouse/src/Functions/FunctionsCharsetClassification.cpp')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | contrib/clickhouse/src/Functions/FunctionsCharsetClassification.cpp | 157 |

1 file changed, 157 insertions, 0 deletions
```diff
diff --git a/contrib/clickhouse/src/Functions/FunctionsCharsetClassification.cpp b/contrib/clickhouse/src/Functions/FunctionsCharsetClassification.cpp
new file mode 100644
index 00000000000..0a332ab70a9
--- /dev/null
+++ b/contrib/clickhouse/src/Functions/FunctionsCharsetClassification.cpp
@@ -0,0 +1,157 @@
+#include <Common/FrequencyHolder.h>
+
+#if USE_NLP
+
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionsTextClassification.h>
+
+#include <memory>
+
+
+namespace DB
+{
+
+namespace
+{
+    /* We need to solve zero-frequency problem for Naive Bayes Classifier
+     * If the bigram is not found in the text, we assume that the probability of its meeting is 1e-06.
+     * 1e-06 is minimal value in our marked-up dictionary.
+     */
+    constexpr Float64 zero_frequency = 1e-06;
+
+    /// If the data size is bigger than this, behaviour is unspecified for this function.
+    constexpr size_t max_string_size = 1UL << 15;
+
+    template <typename ModelMap>
+    ALWAYS_INLINE inline Float64 naiveBayes(
+        const FrequencyHolder::EncodingMap & standard,
+        const ModelMap & model,
+        Float64 max_result)
+    {
+        Float64 res = 0;
+        for (const auto & el : model)
+        {
+            /// Try to find bigram in the dictionary.
+            const auto * it = standard.find(el.getKey());
+            if (it != standard.end())
+            {
+                res += el.getMapped() * log(it->getMapped());
+            } else
+            {
+                res += el.getMapped() * log(zero_frequency);
+            }
+            /// If at some step the result has become less than the current maximum, then it makes no sense to count it fully.
+            if (res < max_result)
+            {
+                return res;
+            }
+        }
+        return res;
+    }
+
+    /// Count how many times each bigram occurs in the text.
+    template <typename ModelMap>
+    ALWAYS_INLINE inline void calculateStats(
+        const UInt8 * data,
+        const size_t size,
+        ModelMap & model)
+    {
+        UInt16 hash = 0;
+        for (size_t i = 0; i < size; ++i)
+        {
+            hash <<= 8;
+            hash += *(data + i);
+            ++model[hash];
+        }
+    }
+}
+
+/* Determine language and charset of text data. For each text, we build the distribution of bigrams bytes.
+ * Then we use marked-up dictionaries with distributions of bigram bytes of various languages and charsets.
+ * Using a naive Bayesian classifier, find the most likely charset and language and return it
+ */
+template <bool detect_language>
+struct CharsetClassificationImpl
+{
+    static void vector(
+        const ColumnString::Chars & data,
+        const ColumnString::Offsets & offsets,
+        ColumnString::Chars & res_data,
+        ColumnString::Offsets & res_offsets)
+    {
+        const auto & encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
+
+        if constexpr (detect_language)
+            /// 2 chars for ISO code + 1 zero byte
+            res_data.reserve(offsets.size() * 3);
+        else
+            /// Mean charset length is 8
+            res_data.reserve(offsets.size() * 8);
+
+        res_offsets.resize(offsets.size());
+
+        size_t current_result_offset = 0;
+
+        double zero_frequency_log = log(zero_frequency);
+
+        for (size_t i = 0; i < offsets.size(); ++i)
+        {
+            const UInt8 * str = data.data() + offsets[i - 1];
+            const size_t str_len = offsets[i] - offsets[i - 1] - 1;
+
+            HashMapWithStackMemory<UInt16, UInt64, DefaultHash<UInt16>, 4> model;
+            calculateStats(str, str_len, model);
+
+            std::string_view result_value;
+
+            /// Go through the dictionary and find the charset with the highest weight
+            Float64 max_result = zero_frequency_log * (max_string_size);
+            for (const auto & item : encodings_freq)
+            {
+                Float64 score = naiveBayes(item.map, model, max_result);
+                if (max_result < score)
+                {
+                    max_result = score;
+
+                    if constexpr (detect_language)
+                        result_value = item.lang;
+                    else
+                        result_value = item.name;
+                }
+            }
+
+            size_t result_value_size = result_value.size();
+            res_data.resize(current_result_offset + result_value_size + 1);
+            memcpy(&res_data[current_result_offset], result_value.data(), result_value_size);
+            res_data[current_result_offset + result_value_size] = '\0';
+            current_result_offset += result_value_size + 1;
+
+            res_offsets[i] = current_result_offset;
+        }
+    }
+};
+
+
+struct NameDetectCharset
+{
+    static constexpr auto name = "detectCharset";
+};
+
+struct NameDetectLanguageUnknown
+{
+    static constexpr auto name = "detectLanguageUnknown";
+};
+
+
+using FunctionDetectCharset = FunctionTextClassificationString<CharsetClassificationImpl<false>, NameDetectCharset>;
+using FunctionDetectLanguageUnknown = FunctionTextClassificationString<CharsetClassificationImpl<true>, NameDetectLanguageUnknown>;
+
+REGISTER_FUNCTION(DetectCharset)
+{
+    factory.registerFunction<FunctionDetectCharset>();
+    factory.registerFunction<FunctionDetectLanguageUnknown>();
+}
+
+}
+
+#endif
```
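The added file classifies text by building a distribution of byte bigrams and scoring it against per-charset (or per-language) reference dictionaries with a naive Bayes log-likelihood, substituting `zero_frequency` for bigrams missing from the dictionary. The snippet below is a minimal standalone sketch of that idea for experimenting outside ClickHouse; the names (`countBigrams`, `naiveBayesScore`) and the toy frequency tables are illustrative assumptions, not part of the ClickHouse API or of this commit.

```cpp
// Illustrative sketch only: mirrors the calculateStats/naiveBayes logic above
// without ClickHouse types (FrequencyHolder, HashMapWithStackMemory, etc.).
#include <cmath>
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>

// Fallback probability for bigrams absent from a reference dictionary.
static constexpr double ZERO_FREQUENCY = 1e-06;

// Count how many times each byte bigram occurs in the text (cf. calculateStats).
std::unordered_map<uint16_t, uint64_t> countBigrams(const std::string & text)
{
    std::unordered_map<uint16_t, uint64_t> model;
    uint16_t hash = 0;
    for (unsigned char c : text)
    {
        hash = static_cast<uint16_t>((hash << 8) + c);
        ++model[hash];
    }
    return model;
}

// Log-likelihood of the observed bigrams under a reference distribution (cf. naiveBayes).
double naiveBayesScore(
    const std::unordered_map<uint16_t, double> & reference,
    const std::unordered_map<uint16_t, uint64_t> & model)
{
    double res = 0;
    for (const auto & [bigram, count] : model)
    {
        auto it = reference.find(bigram);
        const double freq = (it != reference.end()) ? it->second : ZERO_FREQUENCY;
        res += static_cast<double>(count) * std::log(freq);
    }
    return res;
}

int main()
{
    // Toy reference distributions for two hypothetical charsets; the real
    // dictionaries are loaded from marked-up frequency data.
    const std::unordered_map<uint16_t, double> charset_a = {{0x6162, 0.4}, {0x6263, 0.3}};
    const std::unordered_map<uint16_t, double> charset_b = {{0x7879, 0.5}, {0x797a, 0.2}};

    const auto model = countBigrams("abc");
    const double score_a = naiveBayesScore(charset_a, model);
    const double score_b = naiveBayesScore(charset_b, model);

    // The candidate with the higher log-likelihood wins.
    std::cout << (score_a > score_b ? "charset_a" : "charset_b") << '\n';
    return 0;
}
```

Within ClickHouse itself this logic is exposed as the SQL functions `detectCharset` and `detectLanguageUnknown`, registered at the end of the diff.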
