aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Columns/Collator.cpp
blob: 2c12267b5e164f71b9d7a1e1827a74548be817f3 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#include <Columns/Collator.h>

#include "clickhouse_config.h"

#if USE_ICU
#    include <unicode/locid.h>
#    include <unicode/ucnv.h>
#    include <unicode/ucol.h>
#    include <unicode/unistr.h>
#else
#    if defined(__clang__)
#        pragma clang diagnostic ignored "-Wunused-private-field"
#        pragma clang diagnostic ignored "-Wmissing-noreturn"
#    endif
#endif

#include <Common/Exception.h>
#include <Poco/String.h>
#include <algorithm>
#include <base/sort.h>


namespace DB
{
    /// Error codes thrown from this file. They are only declared here (`extern`);
    /// the definitions live in another translation unit.
    namespace ErrorCodes
    {
        extern const int UNSUPPORTED_COLLATION_LOCALE;   /// Unknown locale name, or ICU failed to open it.
        extern const int COLLATION_COMPARISON_FAILED;    /// ICU reported an error while comparing two strings.
        extern const int SUPPORT_IS_DISABLED;            /// Build was made without the ICU library.
    }
}


/// Builds the table of collation locales known to ICU.
///
/// For every locale reported by ICU the map gets one entry keyed by the lower-cased
/// locale name; the value holds the original locale name and, when ICU could provide it,
/// the English display name of the locale's language (UTF-8 encoded).
/// Without ICU support the table is left empty.
AvailableCollationLocales::AvailableCollationLocales()
{
#if USE_ICU
    static constexpr int32_t MAX_LANG_LENGTH = 128;

    /// ICU counts and indexes locales with int32_t, so use the same type for the loop.
    const int32_t available_locales_count = ucol_countAvailable();
    for (int32_t i = 0; i < available_locales_count; ++i)
    {
        const std::string locale_name = ucol_getAvailable(i);
        UChar lang_buffer[MAX_LANG_LENGTH];
        UErrorCode status = U_ZERO_ERROR;

        /// All names will be in English language
        const int32_t lang_length = uloc_getDisplayLanguage(
            locale_name.c_str(), "en", lang_buffer, MAX_LANG_LENGTH, &status);
        std::optional<std::string> lang;

        if (!U_FAILURE(status))
        {
            /// Convert the UTF-16 display name to UTF-8.
            /// English display names are not guaranteed to be pure ASCII (e.g. "Norwegian Bokmål"),
            /// so an invariant-character conversion such as u_UCharsToChars would turn
            /// such characters into NUL bytes. UnicodeString performs a real transcoding.
            lang.emplace();
            icu::UnicodeString(lang_buffer, lang_length).toUTF8String(*lang);
        }

        locales_map.emplace(Poco::toLower(locale_name), LocaleAndLanguage{locale_name, lang});
    }

#endif
}

/// Gives access to the single, lazily constructed table of available locales.
/// The object is created on the first call and the same reference is returned afterwards.
const AvailableCollationLocales & AvailableCollationLocales::instance()
{
    static AvailableCollationLocales singleton;
    return singleton;
}

/// Returns a copy of every known locale entry, ordered by locale name (ascending).
AvailableCollationLocales::LocalesVector AvailableCollationLocales::getAvailableCollations() const
{
    LocalesVector collations;

    /// Only the mapped values are needed; the lower-cased keys are dropped.
    for (const auto & entry : locales_map)
        collations.push_back(entry.second);

    ::sort(
        collations.begin(),
        collations.end(),
        [] (const LocaleAndLanguage & lhs, const LocaleAndLanguage & rhs) { return lhs.locale_name < rhs.locale_name; });

    return collations;
}

/// Tells whether `locale_name` is one of the known collation locales.
/// The lookup is case-insensitive: keys are stored lower-cased, so the argument
/// is lower-cased before searching.
bool AvailableCollationLocales::isCollationSupported(const std::string & locale_name) const
{
    const std::string lookup_key = Poco::toLower(locale_name);
    return locales_map.find(lookup_key) != locales_map.end();
}

/// Creates a collator for the given locale (the name is stored lower-cased).
///
/// Throws UNSUPPORTED_COLLATION_LOCALE when the locale is not in the table of available
/// locales or ICU fails to open it, and SUPPORT_IS_DISABLED when built without ICU.
Collator::Collator(const std::string & locale_)
    : locale(Poco::toLower(locale_))
{
#if USE_ICU
    /// ucol_open silently falls back to the default locale for almost any unknown name,
    /// so the name has to be validated against the known locales up front.
    const auto & known_locales = AvailableCollationLocales::instance();
    if (!known_locales.isCollationSupported(locale))
        throw DB::Exception(DB::ErrorCodes::UNSUPPORTED_COLLATION_LOCALE, "Unsupported collation locale: {}", locale);

    UErrorCode open_status = U_ZERO_ERROR;
    collator = ucol_open(locale.c_str(), &open_status);

    if (!U_FAILURE(open_status))
        return;

    /// The destructor does not run when the constructor throws, so release the handle here.
    ucol_close(collator);
    throw DB::Exception(DB::ErrorCodes::UNSUPPORTED_COLLATION_LOCALE, "Failed to open locale: {} with error: {}", locale, u_errorName(open_status));
#else
    throw DB::Exception(DB::ErrorCodes::SUPPORT_IS_DISABLED,
                        "Collations support is disabled, because ClickHouse was built without ICU library");
#endif
}


/// Releases the ICU collator handle held by this object.
/// When ICU support is compiled out the body is empty.
Collator::~Collator() // NOLINT
{
#if USE_ICU
    ucol_close(collator);
#endif
}

/// Compares two UTF-8 byte ranges according to this collator's locale.
///
/// Returns a negative value, zero or a positive value when the first string sorts
/// before, equal to or after the second one. Throws COLLATION_COMPARISON_FAILED
/// if ICU reports an error. Without ICU support the arguments are ignored and 0 is returned.
int Collator::compare(const char * str1, size_t length1, const char * str2, size_t length2) const
{
#if USE_ICU
    /// Iterate over the UTF-8 input directly instead of converting it to UTF-16 first.
    /// NOTE(review): the ICU length parameter is presumably a 32-bit int, so lengths are narrowed here - confirm.
    UCharIterator lhs_iter;
    UCharIterator rhs_iter;
    uiter_setUTF8(&lhs_iter, str1, length1);
    uiter_setUTF8(&rhs_iter, str2, length2);

    UErrorCode icu_status = U_ZERO_ERROR;
    const UCollationResult ordering = ucol_strcollIter(collator, &lhs_iter, &rhs_iter, &icu_status);

    if (!U_FAILURE(icu_status))
    {
        /** The numeric values of UCollationResult are exactly what is needed:
          *     UCOL_LESS    = -1
          *     UCOL_EQUAL   =  0
          *     UCOL_GREATER =  1
          */
        return static_cast<int>(ordering);
    }

    throw DB::Exception(DB::ErrorCodes::COLLATION_COMPARISON_FAILED, "ICU collation comparison failed with error code: {}",
                        std::string(u_errorName(icu_status)));
#else
    /// Silence unused-parameter warnings in builds without ICU.
    static_cast<void>(str1);
    static_cast<void>(length1);
    static_cast<void>(str2);
    static_cast<void>(length2);
    return 0;
#endif
}

/// Returns the locale name stored in this collator.
/// The reference points at the member, so it stays valid only as long as the object lives.
const std::string & Collator::getLocale() const
{
    return locale;
}