blob: 687fa0053cad3765e08656ace56cb5350c4d5239 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
|
#pragma once
#include "clickhouse_config.h"
#if USE_NLP
#include <base/StringRef.h>
#include <Common/logger_useful.h>
#include <string_view>
#include <unordered_map>
#include <Common/Arena.h>
#include <Common/HashTable/HashMap.h>
#include <Common/StringUtils/StringUtils.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <IO/readFloatText.h>
#include <IO/ZstdInflatingReadBuffer.h>
namespace DB
{
/// FrequencyHolder class is responsible for storing and loading dictionaries
/// needed for text classification functions:
///
/// 1. detectLanguageUnknown
/// 2. detectCharset
/// 3. detectTonality
/// 4. detectProgrammingLanguage
class FrequencyHolder
{
public:
struct Language
{
String name;
HashMap<StringRef, Float64> map;
};
struct Encoding
{
String name;
String lang;
HashMap<UInt16, Float64> map;
};
public:
using Map = HashMap<StringRef, Float64>;
using Container = std::vector<Language>;
using EncodingMap = HashMap<UInt16, Float64>;
using EncodingContainer = std::vector<Encoding>;
static FrequencyHolder & getInstance();
const Map & getEmotionalDict() const
{
return emotional_dict;
}
const EncodingContainer & getEncodingsFrequency() const
{
return encodings_freq;
}
const Container & getProgrammingFrequency() const
{
return programming_freq;
}
private:
FrequencyHolder();
void loadEncodingsFrequency();
void loadEmotionalDict();
void loadProgrammingFrequency();
Arena string_pool;
Map emotional_dict;
Container programming_freq;
EncodingContainer encodings_freq;
};
}
#endif
|