summaryrefslogtreecommitdiffstats
path: root/contrib/clickhouse/src/Functions/soundex.cpp
blob: 0cddfc90f7c7d82935053cfd14310ef92d759ced (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#include <cctype>

#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include <Common/StringUtils/StringUtils.h>


namespace DB
{

namespace ErrorCodes
{
    extern const int ILLEGAL_COLUMN;
}
/** Soundex algorithm, https://en.wikipedia.org/wiki/Soundex
  * Implemented similarly as in most SQL dialects:
  * 1. Save the first letter. Map all occurrences of a, e, i, o, u, y, h, w. to zero(0)
  * 2. Replace all consonants (include the first letter) with digits as follows:
  *  - b, f, p, v → 1
  *  - c, g, j, k, q, s, x, z → 2
  *  - d, t → 3
  *  - l → 4
  *  - m, n → 5
  *  - r → 6
  * 3. Replace all adjacent same digits with one digit, and then remove all the zero (0) digits
  * 4. If the saved letter's digit is the same as the resulting first digit, remove the digit (keep the letter).
  * 5. Append 3 zeros if result contains less than 3 digits. Remove all except first letter and 3 digits after it.
  */

struct SoundexImpl
{
    static constexpr auto length = 4z;
    static constexpr auto soundex_map = "01230120022455012623010202";

    static void calculate(const char * value, size_t value_length, char * out)
    {
        const char * cur = value;
        const char * const end = value + value_length;
        char * const out_end = out + length;

        while (cur < end && !isAlphaASCII(*cur))
            ++cur;

        char prev_code = '0';
        if (cur < end)
        {
            *out = toUpperIfAlphaASCII(*cur);
            ++out;
            prev_code = soundex_map[toUpperIfAlphaASCII(*cur) - 'A'];
            ++cur;
        }

        while (cur < end && !isAlphaASCII(*cur))
            ++cur;

        while (cur < end && out < out_end)
        {
            char current_code = soundex_map[toUpperIfAlphaASCII(*cur) - 'A'];
            if ((current_code != '0') && (current_code != prev_code))
            {
                *out = current_code;
                ++out;
            }
            prev_code = current_code;
            ++cur;

            while (cur < end && !isAlphaASCII(*cur))
                ++cur;
        }

        while (out < out_end)
        {
            *out = '0';
            ++out;
        }
    }

    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        const size_t size = offsets.size();
        res_data.resize(size * (length + 1));
        res_offsets.resize(size);

        size_t prev_offset = 0;
        for (size_t i = 0; i < size; ++i)
        {
            const char * value = reinterpret_cast<const char *>(&data[prev_offset]);
            const size_t value_length = offsets[i] - prev_offset - 1;
            const size_t out_index = i * (length + 1);
            calculate(value, value_length, reinterpret_cast<char *>(&res_data[out_index]));
            res_data[out_index + length] = '\0';
            res_offsets[i] = (out_index + length + 1);
            prev_offset = offsets[i];
        }
    }

    [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
    {
        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by soundex function");
    }
};

struct NameSoundex
{
    static constexpr auto name = "soundex";
};

REGISTER_FUNCTION(Soundex)
{
    factory.registerFunction<FunctionStringToString<SoundexImpl, NameSoundex>>(
        FunctionDocumentation{.description="Returns Soundex code of a string."}, FunctionFactory::CaseInsensitive);
}


}