blob: 3de38d99c88b766eb51e91f8d19e861c8ad546ae (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
#include <Common/FrequencyHolder.h>
#if USE_NLP
#include <Common/StringUtils/StringUtils.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsTextClassification.h>
#include <unordered_map>
namespace DB
{
/**
* Determines the sentiment of text data.
* Uses a marked-up sentiment dictionary, each word has a tonality ranging from -12 to 6.
* For each text, calculate the average sentiment value of its words and return it in range [-1,1]
*/
struct FunctionDetectTonalityImpl
{
static ALWAYS_INLINE inline Float32 detectTonality(
const UInt8 * str,
const size_t str_len,
const FrequencyHolder::Map & emotional_dict)
{
Float64 weight = 0;
UInt64 count_words = 0;
String word;
/// Select all words from the string
for (size_t ind = 0; ind < str_len; ++ind)
{
/// Split words by whitespaces and punctuation signs
if (isWhitespaceASCII(str[ind]) || isPunctuationASCII(str[ind]))
continue;
while (ind < str_len && !(isWhitespaceASCII(str[ind]) || isPunctuationASCII(str[ind])))
{
word.push_back(str[ind]);
++ind;
}
/// Try to find a word in the tonality dictionary
const auto * it = emotional_dict.find(word);
if (it != emotional_dict.end())
{
count_words += 1;
weight += it->getMapped();
}
word.clear();
}
if (!count_words)
return 0;
/// Calculate average value of tonality.
/// Convert values -12..6 to -1..1
if (weight > 0)
return static_cast<Float32>(weight / count_words / 6);
else
return static_cast<Float32>(weight / count_words / 12);
}
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
PaddedPODArray<Float32> & res)
{
const auto & emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
size_t size = offsets.size();
size_t prev_offset = 0;
for (size_t i = 0; i < size; ++i)
{
res[i] = detectTonality(data.data() + prev_offset, offsets[i] - 1 - prev_offset, emotional_dict);
prev_offset = offsets[i];
}
}
};
struct NameDetectTonality
{
static constexpr auto name = "detectTonality";
};
using FunctionDetectTonality = FunctionTextClassificationFloat<FunctionDetectTonalityImpl, NameDetectTonality>;
REGISTER_FUNCTION(DetectTonality)
{
factory.registerFunction<FunctionDetectTonality>();
}
}
#endif
|