aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Functions/FunctionsTonalityClassification.cpp
blob: 3de38d99c88b766eb51e91f8d19e861c8ad546ae (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#include <Common/FrequencyHolder.h>

#if USE_NLP

#include <Common/StringUtils/StringUtils.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsTextClassification.h>

#include <unordered_map>

namespace DB
{

/**
  * Determines the sentiment of text data.
  * Uses a marked-up sentiment dictionary, each word has a tonality ranging from -12 to 6.
  * For each text, calculate the average sentiment value of its words and return it in range [-1,1]
  */
struct FunctionDetectTonalityImpl
{
    /// Computes the tonality score of a single string.
    ///
    /// The input bytes [str, str + str_len) are cut into words at ASCII whitespace
    /// and ASCII punctuation. Every word that is present in `emotional_dict`
    /// contributes its mapped weight; words missing from the dictionary are ignored.
    ///
    /// Returns 0 when no word was found in the dictionary. Otherwise returns the mean
    /// weight of the matched words divided by 6 if the weight sum is positive and by 12
    /// if it is not (this yields [-1, 1] assuming dictionary weights lie within -12..6).
    static ALWAYS_INLINE inline Float32 detectTonality(
        const UInt8 * str,
        const size_t str_len,
        const FrequencyHolder::Map & emotional_dict)
    {
        /// A byte separates words if it is ASCII whitespace or ASCII punctuation.
        const auto is_separator = [](const UInt8 symbol)
        {
            return isWhitespaceASCII(symbol) || isPunctuationASCII(symbol);
        };

        Float64 weight_sum = 0;
        UInt64 matched_words = 0;

        /// Reused between iterations so that its capacity is kept.
        String token;

        size_t pos = 0;
        while (pos < str_len)
        {
            /// Skip separators one byte at a time until a word starts.
            if (is_separator(str[pos]))
            {
                ++pos;
                continue;
            }

            /// Advance to the end of the word (next separator or end of input).
            const size_t token_begin = pos;
            while (pos < str_len && !is_separator(str[pos]))
                ++pos;

            token.assign(reinterpret_cast<const char *>(str) + token_begin, pos - token_begin);

            /// Only words known to the tonality dictionary take part in the average.
            const auto * found = emotional_dict.find(token);
            if (found != emotional_dict.end())
            {
                weight_sum += found->getMapped();
                ++matched_words;
            }
        }

        if (matched_words == 0)
            return 0;

        /// The scale is asymmetric: a positive sum is normalized by 6, a non-positive one by 12.
        const Float64 mean_weight = weight_sum / matched_words;
        const Float64 scale = (weight_sum > 0) ? 6 : 12;
        return static_cast<Float32>(mean_weight / scale);
    }

    /// Applies detectTonality() to every string of a column.
    ///
    /// `data` holds the concatenated string bytes and `offsets[i]` is the end position of
    /// the i-th string inside `data`. The score of the i-th string is written to `res[i]`;
    /// `res` is indexed directly here, so it must already have at least offsets.size() elements.
    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        PaddedPODArray<Float32> & res)
    {
        const auto & emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();

        const size_t rows = offsets.size();
        size_t row_begin = 0;
        for (size_t row = 0; row < rows; ++row)
        {
            const auto row_end = offsets[row];

            /// The length is one byte shorter than the stored span
            /// (presumably a terminating zero byte after each string - confirm against ColumnString layout).
            res[row] = detectTonality(data.data() + row_begin, row_end - 1 - row_begin, emotional_dict);
            row_begin = row_end;
        }
    }
};

/// Name tag for the tonality function: carries the function name as a compile-time string.
struct NameDetectTonality
{
    static constexpr const char * name = "detectTonality";
};

/// The tonality function type: FunctionTextClassificationFloat instantiated with
/// FunctionDetectTonalityImpl as the implementation and NameDetectTonality as the name tag.
using FunctionDetectTonality = FunctionTextClassificationFloat<FunctionDetectTonalityImpl, NameDetectTonality>;

/// Registers FunctionDetectTonality in the function factory.
/// NOTE(review): `factory` is not declared in this file; presumably it is provided by the REGISTER_FUNCTION macro - confirm.
REGISTER_FUNCTION(DetectTonality)
{
    factory.registerFunction<FunctionDetectTonality>();
}

}

#endif