Library import 5, delete go dependencies (#832)

* Library import 5, delete go dependencies
* Fix yt client
author: AlexSm <[email protected]> 2024-01-04 15:09:05 +0100
committer: GitHub <[email protected]> 2024-01-04 15:09:05 +0100
commit: dab291146f6cd7d35684e3a1150e5bb1c412982c (patch)
tree: 36ef35f6cacb6432845a4a33f940c95871036b32 /contrib/clickhouse/src/Functions/FunctionsCharsetClassification.cpp
parent: 63660ad5e7512029fd0218e7a636580695a24e1f (diff)
1 files changed, 0 insertions, 157 deletions
diff --git a/contrib/clickhouse/src/Functions/FunctionsCharsetClassification.cpp b/contrib/clickhouse/src/Functions/FunctionsCharsetClassification.cpp
deleted file mode 100644
index 0a332ab70a9..00000000000
--- a/contrib/clickhouse/src/Functions/FunctionsCharsetClassification.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-#include <Common/FrequencyHolder.h>
-
-#if USE_NLP
-
-#include <Functions/FunctionFactory.h>
-#include <Functions/FunctionsTextClassification.h>
-
-#include <memory>
-
-
-namespace DB
-{
-
-namespace
-{
-    /* We need to solve zero-frequency problem for Naive Bayes Classifier
-     * If the bigram is not found in the text, we assume that the probability of its meeting is 1e-06.
-     * 1e-06 is minimal value in our marked-up dictionary.
-     */
-    constexpr Float64 zero_frequency = 1e-06;
-
-    /// If the data size is bigger than this, behaviour is unspecified for this function.
-    constexpr size_t max_string_size = 1UL << 15;
-
-    template <typename ModelMap>
-    ALWAYS_INLINE inline Float64 naiveBayes(
-        const FrequencyHolder::EncodingMap & standard,
-        const ModelMap & model,
-        Float64 max_result)
-    {
-        Float64 res = 0;
-        for (const auto & el : model)
-        {
-            /// Try to find bigram in the dictionary.
-            const auto * it = standard.find(el.getKey());
-            if (it != standard.end())
-            {
-                res += el.getMapped() * log(it->getMapped());
-            } else
-            {
-                res += el.getMapped() * log(zero_frequency);
-            }
-            /// If at some step the result has become less than the current maximum, then it makes no sense to count it fully.
-            if (res < max_result)
-            {
-                return res;
-            }
-        }
-        return res;
-    }
-
-    /// Count how many times each bigram occurs in the text.
-    template <typename ModelMap>
-    ALWAYS_INLINE inline void calculateStats(
-        const UInt8 * data,
-        const size_t size,
-        ModelMap & model)
-    {
-        UInt16 hash = 0;
-        for (size_t i = 0; i < size; ++i)
-        {
-            hash <<= 8;
-            hash += *(data + i);
-            ++model[hash];
-        }
-    }
-}
-
-/* Determine language and charset of text data. For each text, we build the distribution of bigrams bytes.
- * Then we use marked-up dictionaries with distributions of bigram bytes of various languages and charsets.
- * Using a naive Bayesian classifier, find the most likely charset and language and return it
- */
-template <bool detect_language>
-struct CharsetClassificationImpl
-{
-    static void vector(
-        const ColumnString::Chars & data,
-        const ColumnString::Offsets & offsets,
-        ColumnString::Chars & res_data,
-        ColumnString::Offsets & res_offsets)
-    {
-        const auto & encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
-
-        if constexpr (detect_language)
-            /// 2 chars for ISO code + 1 zero byte
-            res_data.reserve(offsets.size() * 3);
-        else
-            /// Mean charset length is 8
-            res_data.reserve(offsets.size() * 8);
-
-        res_offsets.resize(offsets.size());
-
-        size_t current_result_offset = 0;
-
-        double zero_frequency_log = log(zero_frequency);
-
-        for (size_t i = 0; i < offsets.size(); ++i)
-        {
-            const UInt8 * str = data.data() + offsets[i - 1];
-            const size_t str_len = offsets[i] - offsets[i - 1] - 1;
-
-            HashMapWithStackMemory<UInt16, UInt64, DefaultHash<UInt16>, 4> model;
-            calculateStats(str, str_len, model);
-
-            std::string_view result_value;
-
-            /// Go through the dictionary and find the charset with the highest weight
-            Float64 max_result = zero_frequency_log * (max_string_size);
-            for (const auto & item : encodings_freq)
-            {
-                Float64 score = naiveBayes(item.map, model, max_result);
-                if (max_result < score)
-                {
-                    max_result = score;
-
-                    if constexpr (detect_language)
-                        result_value = item.lang;
-                    else
-                        result_value = item.name;
-                }
-            }
-
-            size_t result_value_size = result_value.size();
-            res_data.resize(current_result_offset + result_value_size + 1);
-            memcpy(&res_data[current_result_offset], result_value.data(), result_value_size);
-            res_data[current_result_offset + result_value_size] = '\0';
-            current_result_offset += result_value_size + 1;
-
-            res_offsets[i] = current_result_offset;
-        }
-    }
-};
-
-
-struct NameDetectCharset
-{
-    static constexpr auto name = "detectCharset";
-};
-
-struct NameDetectLanguageUnknown
-{
-    static constexpr auto name = "detectLanguageUnknown";
-};
-
-
-using FunctionDetectCharset = FunctionTextClassificationString<CharsetClassificationImpl<false>, NameDetectCharset>;
-using FunctionDetectLanguageUnknown = FunctionTextClassificationString<CharsetClassificationImpl<true>, NameDetectLanguageUnknown>;
-
-REGISTER_FUNCTION(DetectCharset)
-{
-    factory.registerFunction<FunctionDetectCharset>();
-    factory.registerFunction<FunctionDetectLanguageUnknown>();
-}
-
-}
-
-#endif
author	AlexSm <[email protected]>	2024-01-04 15:09:05 +0100
committer	GitHub <[email protected]>	2024-01-04 15:09:05 +0100
commit	dab291146f6cd7d35684e3a1150e5bb1c412982c (patch)
tree	36ef35f6cacb6432845a4a33f940c95871036b32 /contrib/clickhouse/src/Functions/FunctionsCharsetClassification.cpp
parent	63660ad5e7512029fd0218e7a636580695a24e1f (diff)