aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/tokenizer/multitokenutil.h
blob: 19781f67858a9c89b5fdce158f2edc052c1dee76 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#pragma once

#include <library/cpp/token/nlptypes.h>
#include <library/cpp/token/token_structure.h>

//! corrects the delimiters of two subtoken spans
//! @note NOTE(review): the implementation is not visible from this header; the
//!       parameter descriptions below are inferred from the names only - confirm
//! @param prevtok    span of the previous subtoken; taken by non-const reference, so it may be modified
//! @param suffixChar presumably the suffix delimiter character belonging to @c prevtok
//! @param lasttok    span of the last subtoken; taken by non-const reference, so it may be modified
//! @param prefixChar presumably the prefix delimiter character belonging to @c lasttok
void CorrectDelimiters(TCharSpan& prevtok, wchar16 suffixChar, TCharSpan& lasttok, wchar16 prefixChar);

//! removes hyphenations and replaces unicode delimiters
//! @param subtokens structure of the subtokens; taken by non-const reference, so it may be modified
//! @param buffer    writable buffer (presumably receives the prepared text - confirm)
//! @param buflen    size associated with @c buffer (presumably its capacity - confirm)
//! @param entry     read-only source text of the multitoken
//! @param len       length taken by non-const reference (presumably in: length of @c entry,
//!                  out: new length of the multitoken - confirm)
//! @return a value of type NLP_TYPE
//! @note NOTE(review): this comment previously read "@return new length of multitoken",
//!       which does not match the declared return type NLP_TYPE; the new length is
//!       most likely reported through @c len - confirm against the implementation
NLP_TYPE PrepareMultitoken(TTokenStructure& subtokens, wchar16* buffer, size_t buflen, const wchar16* entry, size_t& len);

//! cuts off the subtokens according to the specified maximum length
//! @param subtokens structure of the subtokens; taken by non-const reference and modified by the cut
//! @param maxLen    the maximum length the subtokens are cut off to
//! @return new length of the subtokens
size_t AdjustSubtokens(TTokenStructure& subtokens, size_t maxLen);

//! corrects positions of subtokens and cuts off their length according to the specified maximum
//! @param subtokens structure of the subtokens; taken by non-const reference and modified in place
//! @param n         number of leading characters that are accents (see the note below)
//! @param maxLen    the maximum length the subtokens are cut off to
//! @note the first @c n characters are accents
//! @return new length of the subtokens
size_t AdjustSubtokens(TTokenStructure& subtokens, size_t n, size_t maxLen);

//! checks a multitoken; for debugging purposes only
//! @param tok the multitoken to check (read-only)
//! @return result of the check (presumably @c true when the multitoken is consistent - confirm)
bool CheckMultitoken(const TWideToken& tok);