diff options
author | neksard <neksard@yandex-team.ru> | 2022-02-10 16:45:23 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:23 +0300 |
commit | 8f7cf138264e0caa318144bf8a2c950e0b0a8593 (patch) | |
tree | 83bf5c8c8047c42d8475e6095df90ccdc3d1b57f /contrib/libs/icu/i18n/csrmbcs.h | |
parent | d3a398281c6fd1d3672036cb2d63f842d2cb28c5 (diff) | |
download | ydb-8f7cf138264e0caa318144bf8a2c950e0b0a8593.tar.gz |
Restoring authorship annotation for <neksard@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/icu/i18n/csrmbcs.h')
-rw-r--r-- | contrib/libs/icu/i18n/csrmbcs.h | 412 |
1 files changed, 206 insertions, 206 deletions
diff --git a/contrib/libs/icu/i18n/csrmbcs.h b/contrib/libs/icu/i18n/csrmbcs.h index 8ccf1d56a9..ce4f5dbee4 100644 --- a/contrib/libs/icu/i18n/csrmbcs.h +++ b/contrib/libs/icu/i18n/csrmbcs.h @@ -1,207 +1,207 @@ // © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* - ********************************************************************** - * Copyright (C) 2005-2012, International Business Machines - * Corporation and others. All Rights Reserved. - ********************************************************************** - */ - -#ifndef __CSRMBCS_H -#define __CSRMBCS_H - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_CONVERSION - -#include "csrecog.h" - -U_NAMESPACE_BEGIN - -// "Character" iterated character class. -// Recognizers for specific mbcs encodings make their "characters" available -// by providing a nextChar() function that fills in an instance of IteratedChar -// with the next char from the input. -// The returned characters are not converted to Unicode, but remain as the raw -// bytes (concatenated into an int) from the codepage data. -// -// For Asian charsets, use the raw input rather than the input that has been -// stripped of markup. Detection only considers multi-byte chars, effectively -// stripping markup anyway, and double byte chars do occur in markup too. -// -class IteratedChar : public UMemory -{ -public: - uint32_t charValue; // 1-4 bytes from the raw input data - int32_t index; - int32_t nextIndex; - UBool error; - UBool done; - -public: - IteratedChar(); - //void reset(); - int32_t nextByte(InputText* det); -}; - - -class CharsetRecog_mbcs : public CharsetRecognizer { - -protected: - /** - * Test the match of this charset with the input text data - * which is obtained via the CharsetDetector object. - * - * @param det The CharsetDetector, which contains the input text - * to be checked for being in this charset. - * @return Two values packed into one int (Damn java, anyhow) - * <br/> - * bits 0-7: the match confidence, ranging from 0-100 - * <br/> - * bits 8-15: The match reason, an enum-like value. - */ - int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const; - -public: - - virtual ~CharsetRecog_mbcs(); - - /** - * Get the IANA name of this charset. - * @return the charset name. - */ - - const char *getName() const = 0; - const char *getLanguage() const = 0; - UBool match(InputText* input, CharsetMatch *results) const = 0; - - /** - * Get the next character (however many bytes it is) from the input data - * Subclasses for specific charset encodings must implement this function - * to get characters according to the rules of their encoding scheme. - * - * This function is not a method of class IteratedChar only because - * that would require a lot of extra derived classes, which is awkward. - * @param it The IteratedChar "struct" into which the returned char is placed. - * @param det The charset detector, which is needed to get at the input byte data - * being iterated over. - * @return True if a character was returned, false at end of input. - */ - virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0; - -}; - - -/** - * Shift-JIS charset recognizer. - * - */ -class CharsetRecog_sjis : public CharsetRecog_mbcs { -public: - virtual ~CharsetRecog_sjis(); - - UBool nextChar(IteratedChar *it, InputText *det) const; - - UBool match(InputText* input, CharsetMatch *results) const; - - const char *getName() const; - const char *getLanguage() const; - -}; - - -/** - * EUC charset recognizers. One abstract class that provides the common function - * for getting the next character according to the EUC encoding scheme, - * and nested derived classes for EUC_KR, EUC_JP, EUC_CN. - * - */ -class CharsetRecog_euc : public CharsetRecog_mbcs -{ -public: - virtual ~CharsetRecog_euc(); - - const char *getName() const = 0; - const char *getLanguage() const = 0; - - UBool match(InputText* input, CharsetMatch *results) const = 0; - /* - * (non-Javadoc) - * Get the next character value for EUC based encodings. - * Character "value" is simply the raw bytes that make up the character - * packed into an int. - */ - UBool nextChar(IteratedChar *it, InputText *det) const; -}; - -/** - * The charset recognize for EUC-JP. A singleton instance of this class - * is created and kept by the public CharsetDetector class - */ -class CharsetRecog_euc_jp : public CharsetRecog_euc -{ -public: - virtual ~CharsetRecog_euc_jp(); - - const char *getName() const; - const char *getLanguage() const; - - UBool match(InputText* input, CharsetMatch *results) const; -}; - -/** - * The charset recognize for EUC-KR. A singleton instance of this class - * is created and kept by the public CharsetDetector class - */ -class CharsetRecog_euc_kr : public CharsetRecog_euc -{ -public: - virtual ~CharsetRecog_euc_kr(); - - const char *getName() const; - const char *getLanguage() const; - - UBool match(InputText* input, CharsetMatch *results) const; -}; - -/** - * - * Big5 charset recognizer. - * - */ -class CharsetRecog_big5 : public CharsetRecog_mbcs -{ -public: - virtual ~CharsetRecog_big5(); - - UBool nextChar(IteratedChar* it, InputText* det) const; - - const char *getName() const; - const char *getLanguage() const; - - UBool match(InputText* input, CharsetMatch *results) const; -}; - - -/** - * - * GB-18030 recognizer. Uses simplified Chinese statistics. - * - */ -class CharsetRecog_gb_18030 : public CharsetRecog_mbcs -{ -public: - virtual ~CharsetRecog_gb_18030(); - - UBool nextChar(IteratedChar* it, InputText* det) const; - - const char *getName() const; - const char *getLanguage() const; - - UBool match(InputText* input, CharsetMatch *results) const; -}; - -U_NAMESPACE_END - -#endif -#endif /* __CSRMBCS_H */ +// License & terms of use: http://www.unicode.org/copyright.html +/* + ********************************************************************** + * Copyright (C) 2005-2012, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + */ + +#ifndef __CSRMBCS_H +#define __CSRMBCS_H + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "csrecog.h" + +U_NAMESPACE_BEGIN + +// "Character" iterated character class. +// Recognizers for specific mbcs encodings make their "characters" available +// by providing a nextChar() function that fills in an instance of IteratedChar +// with the next char from the input. +// The returned characters are not converted to Unicode, but remain as the raw +// bytes (concatenated into an int) from the codepage data. +// +// For Asian charsets, use the raw input rather than the input that has been +// stripped of markup. Detection only considers multi-byte chars, effectively +// stripping markup anyway, and double byte chars do occur in markup too. +// +class IteratedChar : public UMemory +{ +public: + uint32_t charValue; // 1-4 bytes from the raw input data + int32_t index; + int32_t nextIndex; + UBool error; + UBool done; + +public: + IteratedChar(); + //void reset(); + int32_t nextByte(InputText* det); +}; + + +class CharsetRecog_mbcs : public CharsetRecognizer { + +protected: + /** + * Test the match of this charset with the input text data + * which is obtained via the CharsetDetector object. + * + * @param det The CharsetDetector, which contains the input text + * to be checked for being in this charset. + * @return Two values packed into one int (Damn java, anyhow) + * <br/> + * bits 0-7: the match confidence, ranging from 0-100 + * <br/> + * bits 8-15: The match reason, an enum-like value. + */ + int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const; + +public: + + virtual ~CharsetRecog_mbcs(); + + /** + * Get the IANA name of this charset. + * @return the charset name. + */ + + const char *getName() const = 0; + const char *getLanguage() const = 0; + UBool match(InputText* input, CharsetMatch *results) const = 0; + + /** + * Get the next character (however many bytes it is) from the input data + * Subclasses for specific charset encodings must implement this function + * to get characters according to the rules of their encoding scheme. + * + * This function is not a method of class IteratedChar only because + * that would require a lot of extra derived classes, which is awkward. + * @param it The IteratedChar "struct" into which the returned char is placed. + * @param det The charset detector, which is needed to get at the input byte data + * being iterated over. + * @return True if a character was returned, false at end of input. + */ + virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0; + +}; + + +/** + * Shift-JIS charset recognizer. + * + */ +class CharsetRecog_sjis : public CharsetRecog_mbcs { +public: + virtual ~CharsetRecog_sjis(); + + UBool nextChar(IteratedChar *it, InputText *det) const; + + UBool match(InputText* input, CharsetMatch *results) const; + + const char *getName() const; + const char *getLanguage() const; + +}; + + +/** + * EUC charset recognizers. One abstract class that provides the common function + * for getting the next character according to the EUC encoding scheme, + * and nested derived classes for EUC_KR, EUC_JP, EUC_CN. + * + */ +class CharsetRecog_euc : public CharsetRecog_mbcs +{ +public: + virtual ~CharsetRecog_euc(); + + const char *getName() const = 0; + const char *getLanguage() const = 0; + + UBool match(InputText* input, CharsetMatch *results) const = 0; + /* + * (non-Javadoc) + * Get the next character value for EUC based encodings. + * Character "value" is simply the raw bytes that make up the character + * packed into an int. + */ + UBool nextChar(IteratedChar *it, InputText *det) const; +}; + +/** + * The charset recognize for EUC-JP. A singleton instance of this class + * is created and kept by the public CharsetDetector class + */ +class CharsetRecog_euc_jp : public CharsetRecog_euc +{ +public: + virtual ~CharsetRecog_euc_jp(); + + const char *getName() const; + const char *getLanguage() const; + + UBool match(InputText* input, CharsetMatch *results) const; +}; + +/** + * The charset recognize for EUC-KR. A singleton instance of this class + * is created and kept by the public CharsetDetector class + */ +class CharsetRecog_euc_kr : public CharsetRecog_euc +{ +public: + virtual ~CharsetRecog_euc_kr(); + + const char *getName() const; + const char *getLanguage() const; + + UBool match(InputText* input, CharsetMatch *results) const; +}; + +/** + * + * Big5 charset recognizer. + * + */ +class CharsetRecog_big5 : public CharsetRecog_mbcs +{ +public: + virtual ~CharsetRecog_big5(); + + UBool nextChar(IteratedChar* it, InputText* det) const; + + const char *getName() const; + const char *getLanguage() const; + + UBool match(InputText* input, CharsetMatch *results) const; +}; + + +/** + * + * GB-18030 recognizer. Uses simplified Chinese statistics. + * + */ +class CharsetRecog_gb_18030 : public CharsetRecog_mbcs +{ +public: + virtual ~CharsetRecog_gb_18030(); + + UBool nextChar(IteratedChar* it, InputText* det) const; + + const char *getName() const; + const char *getLanguage() const; + + UBool match(InputText* input, CharsetMatch *results) const; +}; + +U_NAMESPACE_END + +#endif +#endif /* __CSRMBCS_H */ |