diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/special_tokens.cpp | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/special_tokens.cpp')
-rw-r--r-- | library/cpp/tokenizer/special_tokens.cpp | 27 |
1 files changed, 27 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/special_tokens.cpp b/library/cpp/tokenizer/special_tokens.cpp new file mode 100644 index 00000000000..141ea9ac065 --- /dev/null +++ b/library/cpp/tokenizer/special_tokens.cpp @@ -0,0 +1,27 @@ +#include "special_tokens.h" + +#include <library/cpp/containers/comptrie/set.h> + +#include <util/generic/singleton.h> + +namespace { + extern "C" { + extern const unsigned char SpecialTokens[]; + extern const ui32 SpecialTokensSize; + } + + class TSpecialTokensSet: public TCompactTrieSet<wchar16> { + public: + TSpecialTokensSet(): TCompactTrieSet<wchar16>(reinterpret_cast<const char*>(SpecialTokens), SpecialTokensSize) + { + } + }; + + auto SpecialTokensSet = Singleton<TSpecialTokensSet>(); +} + +size_t GetSpecialTokenLength(const wchar16* text, size_t maxLen) { + size_t resultLen = 0; + SpecialTokensSet->FindLongestPrefix(text, maxLen, &resultLen); + return resultLen; +} |