blob: aa0e1b048352524f1ef64ea29be2d75774e428e7 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
#pragma once
#include <Common/PODArray.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/UTF8Helpers.h>
#include <algorithm>
#include <climits>
#include <cstring>
#include <memory>
#include <utility>
#ifdef __SSE4_2__
# include <nmmintrin.h>
#endif
namespace DB
{
// Used by FunctionsStringSimilarity and FunctionsStringHash.
// Provides helpers for extracting ASCII n-grams, UTF-8 n-grams, ASCII words and UTF-8 words.
struct ExtractStringImpl
{
    // Skip any leading separators, then consume one word.
    //
    // pos - in/out cursor; on return it points at the first separator after the word, or at `end`.
    // end - one past the last readable byte.
    //
    // Returns a pointer to the first byte of the word; the word is the range [returned pointer, pos).
    // The range is empty when only separators (or no bytes at all) were left before `end`.
    static ALWAYS_INLINE inline const UInt8 * readOneWord(const UInt8 *& pos, const UInt8 * end)
    {
        // jump over separators
        while (pos < end && isUTF8Sep(*pos))
            ++pos;

        // the word starts here
        const UInt8 * word_start = pos;
        while (pos < end && !isUTF8Sep(*pos))
            ++pos;

        return word_start;
    }

    // We use ASCII non-alphanumeric characters as UTF8 separators.
    // Bytes >= 128 are never treated as separators.
    static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); }

    // Read one UTF8 character: advance `pos` by the sequence length reported by UTF8::seqLength
    // for the byte at `pos`, but never beyond `end`.
    //
    // Does nothing when no bytes remain, so the byte at `end` is never read.
    static ALWAYS_INLINE inline void readOneUTF8Code(const UInt8 *& pos, const UInt8 * end)
    {
        // Work with the remaining byte count instead of comparing `pos + length` with `end`:
        // this avoids forming a pointer past the end of the buffer.
        const auto remaining = end - pos;
        if (remaining <= 0)
            return;

        // A sequence truncated by the end of the buffer is consumed only up to `end`.
        const size_t length = std::min<size_t>(UTF8::seqLength(*pos), static_cast<size_t>(remaining));
        pos += length;
    }
};
}
|