aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Common/FrequencyHolder.h
blob: 687fa0053cad3765e08656ace56cb5350c4d5239 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#pragma once

#include "clickhouse_config.h"

#if USE_NLP

#include <base/StringRef.h>
#include <Common/logger_useful.h>

#include <string_view>
#include <unordered_map>

#include <Common/Arena.h>
#include <Common/HashTable/HashMap.h>
#include <Common/StringUtils/StringUtils.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <IO/readFloatText.h>
#include <IO/ZstdInflatingReadBuffer.h>


namespace DB
{

/// FrequencyHolder class is responsible for storing and loading dictionaries
/// needed for text classification functions:
///
/// 1. detectLanguageUnknown
/// 2. detectCharset
/// 3. detectTonality
/// 4. detectProgrammingLanguage

class FrequencyHolder
{
public:
    struct Language
    {
        String name;
        HashMap<StringRef, Float64> map;
    };

    struct Encoding
    {
        String name;
        String lang;
        HashMap<UInt16, Float64> map;
    };

public:
    using Map = HashMap<StringRef, Float64>;
    using Container = std::vector<Language>;

    using EncodingMap = HashMap<UInt16, Float64>;
    using EncodingContainer = std::vector<Encoding>;

    static FrequencyHolder & getInstance();

    const Map & getEmotionalDict() const
    {
        return emotional_dict;
    }

    const EncodingContainer & getEncodingsFrequency() const
    {
        return encodings_freq;
    }

    const Container & getProgrammingFrequency() const
    {
        return programming_freq;
    }

private:
    FrequencyHolder();

    void loadEncodingsFrequency();
    void loadEmotionalDict();
    void loadProgrammingFrequency();

    Arena string_pool;

    Map emotional_dict;
    Container programming_freq;
    EncodingContainer encodings_freq;
};
}

#endif