diff options
author | hcpp <hcpp@ydb.tech> | 2023-11-08 12:09:41 +0300 |
---|---|---|
committer | hcpp <hcpp@ydb.tech> | 2023-11-08 12:56:14 +0300 |
commit | a361f5b98b98b44ea510d274f6769164640dd5e1 (patch) | |
tree | c47c80962c6e2e7b06798238752fd3da0191a3f6 /contrib/libs/libmysql_r/strings/mb_wc.h | |
parent | 9478806fde1f4d40bd5a45e7cbe77237dab613e9 (diff) | |
download | ydb-a361f5b98b98b44ea510d274f6769164640dd5e1.tar.gz |
metrics have been added
Diffstat (limited to 'contrib/libs/libmysql_r/strings/mb_wc.h')
-rw-r--r-- | contrib/libs/libmysql_r/strings/mb_wc.h | 224 |
1 files changed, 224 insertions, 0 deletions
diff --git a/contrib/libs/libmysql_r/strings/mb_wc.h b/contrib/libs/libmysql_r/strings/mb_wc.h new file mode 100644 index 0000000000..4deb473c99 --- /dev/null +++ b/contrib/libs/libmysql_r/strings/mb_wc.h @@ -0,0 +1,224 @@ +#ifndef MB_WC_INCLUDED +#define MB_WC_INCLUDED + +/* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License, version 2.0, as published by the Free Software Foundation. + + This library is also distributed with certain software (including + but not limited to OpenSSL) that is licensed under separate terms, + as designated in a particular file or component or in included license + documentation. The authors of MySQL hereby grant you an additional + permission to link the library and your derivative works with the + separately licensed software that they have included with MySQL. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License, version 2.0, for more details. + + You should have received a copy of the GNU Library General Public + License along with this library; if not, write to the Free + Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, + MA 02110-1301 USA */ + +/** + @file mb_wc.h + + Definitions of mb_wc (multibyte to wide character, ie., effectively + “parse a UTF-8 character”) functions for UTF-8 (both three- and four-byte). + These are available both as inline functions, as C-style thunks so that they + can fit into MY_CHARSET_HANDLER, and as functors. 
+ + The functors exist so that you can specialize a class on them and get them + inlined instead of having to call them through the function pointer in + MY_CHARSET_HANDLER; mb_wc is in itself so cheap (the most common case is + just a single byte load and a predictable compare) that the call overhead + in a tight loop is significant, and these routines tend to take up a lot + of CPU time when sorting. Typically, at the outermost level, you'd simply + compare cs->cset->mb_wc with my_mb_wc_{utf8,utf8mb4}_thunk, and if so, + instantiate your function with the given class. If it doesn't match, + you can use Mb_wc_through_function_pointer, which calls through the + function pointer as usual. (It will cache the function pointer for you, + which is typically faster than looking it up all the time -- the compiler + cannot always figure out on its own that it doesn't change.) + + The Mb_wc_* classes should be sent by _value_, not by reference, since + they are never larger than two pointers (and usually simply zero). +*/ + +#include "m_ctype.h" +#include "my_compiler.h" +#include "my_config.h" + +template <bool RANGE_CHECK, bool SUPPORT_MB4> +static int my_mb_wc_utf8_prototype(my_wc_t *pwc, const uchar *s, + const uchar *e); + +static int my_mb_wc_utf8(my_wc_t *pwc, const uchar *s, const uchar *e); +static int my_mb_wc_utf8mb4(my_wc_t *pwc, const uchar *s, const uchar *e); + +/** + Functor that converts a UTF-8 multibyte sequence (up to three bytes) + to a wide character. +*/ +struct Mb_wc_utf8 { + Mb_wc_utf8() {} + + ALWAYS_INLINE + int operator()(my_wc_t *pwc, const uchar *s, const uchar *e) const { + return my_mb_wc_utf8(pwc, s, e); + } +}; + +/** + Functor that converts a UTF-8 multibyte sequence (up to four bytes) + to a wide character. 
+*/ +struct Mb_wc_utf8mb4 { + Mb_wc_utf8mb4() {} + + ALWAYS_INLINE + int operator()(my_wc_t *pwc, const uchar *s, const uchar *e) const { + return my_mb_wc_utf8mb4(pwc, s, e); + } +}; + +/** + Functor that uses a function pointer to convert a multibyte sequence + to a wide character. +*/ +class Mb_wc_through_function_pointer { + public: + explicit Mb_wc_through_function_pointer(const CHARSET_INFO *cs) + : m_funcptr(cs->cset->mb_wc), m_cs(cs) {} + + int operator()(my_wc_t *pwc, const uchar *s, const uchar *e) const { + return m_funcptr(m_cs, pwc, s, e); + } + + private: + typedef int (*mbwc_func_t)(const CHARSET_INFO *, my_wc_t *, const uchar *, + const uchar *); + + const mbwc_func_t m_funcptr; + const CHARSET_INFO *const m_cs; +}; + +template <bool RANGE_CHECK, bool SUPPORT_MB4> +static ALWAYS_INLINE int my_mb_wc_utf8_prototype(my_wc_t *pwc, const uchar *s, + const uchar *e) { + if (RANGE_CHECK && s >= e) return MY_CS_TOOSMALL; + + uchar c = s[0]; + if (c < 0x80) { + *pwc = c; + return 1; + } + + if (c < 0xe0) { + if (c < 0xc2) // Resulting code point would be less than 0x80. + return MY_CS_ILSEQ; + + if (RANGE_CHECK && s + 2 > e) return MY_CS_TOOSMALL2; + + if ((s[1] & 0xc0) != 0x80) // Next byte must be a continuation byte. + return MY_CS_ILSEQ; + + *pwc = ((my_wc_t)(c & 0x1f) << 6) + (my_wc_t)(s[1] & 0x3f); + return 2; + } + + if (c < 0xf0) { + if (RANGE_CHECK && s + 3 > e) return MY_CS_TOOSMALL3; + + // Next two bytes must be continuation bytes. + uint16 two_bytes; + memcpy(&two_bytes, s + 1, sizeof(two_bytes)); + if ((two_bytes & 0xc0c0) != 0x8080) // Endianness does not matter. + return MY_CS_ILSEQ; + + *pwc = ((my_wc_t)(c & 0x0f) << 12) + ((my_wc_t)(s[1] & 0x3f) << 6) + + (my_wc_t)(s[2] & 0x3f); + if (*pwc < 0x800) return MY_CS_ILSEQ; + /* + According to RFC 3629, UTF-8 should prohibit characters between + U+D800 and U+DFFF, which are reserved for surrogate pairs and do + not directly represent characters. 
+ */ + if (*pwc >= 0xd800 && *pwc <= 0xdfff) return MY_CS_ILSEQ; + return 3; + } + + if (SUPPORT_MB4) { + if (RANGE_CHECK && s + 4 > e) /* We need 4 characters */ + return MY_CS_TOOSMALL4; + + /* + This byte must be of the form 11110xxx, and the next three bytes + must be continuation bytes. + */ + uint32 four_bytes; + memcpy(&four_bytes, s, sizeof(four_bytes)); +#ifdef WORDS_BIGENDIAN + if ((four_bytes & 0xf8c0c0c0) != 0xf0808080) +#else + if ((four_bytes & 0xc0c0c0f8) != 0x808080f0) +#endif + return MY_CS_ILSEQ; + + *pwc = ((my_wc_t)(c & 0x07) << 18) + ((my_wc_t)(s[1] & 0x3f) << 12) + + ((my_wc_t)(s[2] & 0x3f) << 6) + (my_wc_t)(s[3] & 0x3f); + if (*pwc < 0x10000 || *pwc > 0x10ffff) return MY_CS_ILSEQ; + return 4; + } + + return MY_CS_ILSEQ; +} + +/** + Parses a single UTF-8 character from a byte string. + + @param[out] pwc the parsed character, if any + @param s the string to read from + @param e the end of the string; will not read past this + + @return the number of bytes read from s, or a value <= 0 for failure + (see m_ctype.h) +*/ +static inline int my_mb_wc_utf8(my_wc_t *pwc, const uchar *s, const uchar *e) { + return my_mb_wc_utf8_prototype</*RANGE_CHECK=*/true, /*SUPPORT_MB4=*/false>( + pwc, s, e); +} + +/** + Parses a single UTF-8 character from a byte string. The difference + between this and my_mb_wc_utf8 is that this function also can handle + four-byte UTF-8 characters. + + @param[out] pwc the parsed character, if any + @param s the string to read from + @param e the end of the string; will not read past this + + @return the number of bytes read from s, or a value <= 0 for failure + (see m_ctype.h) +*/ +static ALWAYS_INLINE int my_mb_wc_utf8mb4(my_wc_t *pwc, const uchar *s, + const uchar *e) { + return my_mb_wc_utf8_prototype</*RANGE_CHECK=*/true, /*SUPPORT_MB4=*/true>( + pwc, s, e); +} + +// Non-inlined versions of the above. 
These are used as function pointers +// in MY_CHARSET_HANDLER structs, and you can compare against them to see +// if using the Mb_wc_utf8* functors would be appropriate. + +extern "C" int my_mb_wc_utf8_thunk(const CHARSET_INFO *cs, my_wc_t *pwc, + const uchar *s, const uchar *e); + +extern "C" int my_mb_wc_utf8mb4_thunk(const CHARSET_INFO *cs, my_wc_t *pwc, + const uchar *s, const uchar *e); + +#endif // MB_WC_INCLUDED |