aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Interpreters/Lemmatizers.cpp
blob: 0f94155d37a2009a9e0af685b449d002e87477ed (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#include "clickhouse_config.h"

#if USE_NLP

#include <Common/Exception.h>
#include <Interpreters/Lemmatizers.h>
#error #include <RdrLemmatizer.h>

#include <vector>
#include <filesystem>

namespace DB
{

namespace ErrorCodes
{
    extern const int UNKNOWN_ELEMENT_IN_CONFIG;
    extern const int INVALID_CONFIG_PARAMETER;
}


/// ILemmatizer implementation that forwards each request to an RdrLemmatizer instance
/// constructed from `path.data()`.
class Lemmatizer : public ILemmatizer
{
public:
    /// `path` is handed to the RdrLemmatizer constructor as a `const char *`.
    explicit Lemmatizer(const String & path)
        : lemmatizer(path.data())
    {
    }

    /// Calls RdrLemmatizer::Lemmatize on `token` and wraps whatever it returns in a TokenPtr.
    TokenPtr lemmatize(const char * token) override
    {
        return TokenPtr(lemmatizer.Lemmatize(token));
    }

private:
    /// The wrapped lemmatizer; it is the only state of this class.
    RdrLemmatizer lemmatizer;
};

/// Same helper as the one in StringUtils.h; duplicated here to keep dependencies down.
/// Returns true when `s` begins with the null-terminated string `prefix`
/// (an empty prefix matches every string).
static bool startsWith(const std::string & s, const char * prefix)
{
    const size_t prefix_length = strlen(prefix);

    /// A string shorter than the prefix cannot start with it.
    if (s.size() < prefix_length)
        return false;

    return memcmp(s.data(), prefix, prefix_length) == 0;
}

/// Fills `paths` (language -> path) from the "lemmatizers" section of the server config.
///
/// Every child key of that section has to start with "lemmatizer" and has to carry
/// non-empty `lang` and `path` values. If two entries share a language, the one
/// visited later overwrites the earlier one.
///
/// Throws INVALID_CONFIG_PARAMETER if the section is absent or if `lang` / `path`
/// is missing or empty, and UNKNOWN_ELEMENT_IN_CONFIG for any other child key.
Lemmatizers::Lemmatizers(const Poco::Util::AbstractConfiguration & config)
{
    const String section = "lemmatizers";

    if (!config.has(section))
        throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "No lemmatizers specified in server config on prefix '{}'", section);

    Poco::Util::AbstractConfiguration::Keys entries;
    config.keys(section, entries);

    for (const auto & entry : entries)
    {
        /// Only "lemmatizer*" children are allowed inside the section.
        if (!startsWith(entry, "lemmatizer"))
            throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG, "Unknown element in config: {}.{}, must be 'lemmatizer'",
                section, entry);

        const String entry_path = section + "." + entry;

        /// An empty default lets a missing value be reported the same way as an empty one.
        const auto language = config.getString(entry_path + ".lang", "");
        const auto model_path = config.getString(entry_path + ".path", "");

        if (language.empty())
            throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER,
                "Lemmatizer language in config is not specified here: {}.{}.lang", section, entry);

        if (model_path.empty())
            throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER,
                "Path to lemmatizer in config is not specified here: {}.{}.path", section, entry);

        paths[language] = model_path;
    }
}

/// Returns the lemmatizer registered under `name`, constructing it on first request.
///
/// The whole call runs under `mutex`. An instance already stored in `lemmatizers` is
/// returned as is; otherwise the path recorded in `paths` is checked for existence,
/// a new Lemmatizer is built from it, stored in `lemmatizers` and returned.
///
/// Throws INVALID_CONFIG_PARAMETER if `name` has no entry in `paths` or if the
/// recorded path does not exist.
Lemmatizers::LemmPtr Lemmatizers::getLemmatizer(const String & name)
{
    std::lock_guard lock(mutex);

    if (auto cached = lemmatizers.find(name); cached != lemmatizers.end())
        return cached->second;

    const auto configured = paths.find(name);
    if (configured == paths.end())
        throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Lemmatizer named: '{}' is not found", name);

    const auto & model_path = configured->second;
    if (!std::filesystem::exists(model_path))
        throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Incorrect path to lemmatizer: {}", model_path);

    /// Build the object before touching the cache so that a throwing constructor leaves no entry behind.
    LemmPtr created = std::make_shared<Lemmatizer>(model_path);
    lemmatizers[name] = created;
    return created;
}

}

#endif