aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/tokenizer/special_tokens.cpp
diff options
context:
space:
mode:
author qrort <qrort@yandex-team.com> 2022-11-30 23:47:12 +0300
committer qrort <qrort@yandex-team.com> 2022-11-30 23:47:12 +0300
commit 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
tree bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/special_tokens.cpp
parent 332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
download ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/special_tokens.cpp')
-rw-r--r-- library/cpp/tokenizer/special_tokens.cpp 27
1 files changed, 27 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/special_tokens.cpp b/library/cpp/tokenizer/special_tokens.cpp
new file mode 100644
index 00000000000..141ea9ac065
--- /dev/null
+++ b/library/cpp/tokenizer/special_tokens.cpp
@@ -0,0 +1,27 @@
+#include "special_tokens.h"
+
+#include <library/cpp/containers/comptrie/set.h>
+
+#include <util/generic/singleton.h>
+
+namespace {
+    // Buffer and byte count handed to the trie constructor below. Both are
+    // only declared here (under extern "C"); no definition appears in this file.
+    extern "C" {
+        extern const unsigned char SpecialTokens[];
+        extern const ui32 SpecialTokensSize;
+    }
+
+    // Shorthand for the trie-set type this file specializes.
+    using TWideTokenTrieSet = TCompactTrieSet<wchar16>;
+
+    // Trie set over wchar16 keys constructed directly on top of the
+    // SpecialTokens buffer (presumably a prebuilt, serialized trie -- confirm
+    // against whatever generates SpecialTokens).
+    class TSpecialTokensSet: public TWideTokenTrieSet {
+    public:
+        TSpecialTokensSet()
+            : TWideTokenTrieSet(reinterpret_cast<const char*>(SpecialTokens), SpecialTokensSize)
+        {
+        }
+    };
+
+    // File-local pointer to the shared set, obtained from Singleton<>() when
+    // this namespace-scope variable is initialized.
+    // NOTE(review): code that runs before this initializer (e.g. from another
+    // translation unit's static initialization) would see an unset pointer --
+    // confirm no such callers exist.
+    TSpecialTokensSet* SpecialTokensSet = Singleton<TSpecialTokensSet>();
+}
+
+// Looks up the longest prefix of text[0, maxLen) in the special-tokens set and
+// returns the prefix length reported by that lookup. The result is 0 when the
+// lookup leaves the length untouched.
+size_t GetSpecialTokenLength(const wchar16* text, size_t maxLen) {
+    size_t matchedLen = 0;
+    // Only the length written through the out-pointer is consumed here.
+    SpecialTokensSet->FindLongestPrefix(text, maxLen, &matchedLen);
+    return matchedLen;
+}