aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/libmysql_r/strings/mb_wc.h
diff options
context:
space:
mode:
authorhcpp <hcpp@ydb.tech>2023-11-08 12:09:41 +0300
committerhcpp <hcpp@ydb.tech>2023-11-08 12:56:14 +0300
commita361f5b98b98b44ea510d274f6769164640dd5e1 (patch)
treec47c80962c6e2e7b06798238752fd3da0191a3f6 /contrib/libs/libmysql_r/strings/mb_wc.h
parent9478806fde1f4d40bd5a45e7cbe77237dab613e9 (diff)
downloadydb-a361f5b98b98b44ea510d274f6769164640dd5e1.tar.gz
metrics have been added
Diffstat (limited to 'contrib/libs/libmysql_r/strings/mb_wc.h')
-rw-r--r--contrib/libs/libmysql_r/strings/mb_wc.h224
1 files changed, 224 insertions, 0 deletions
diff --git a/contrib/libs/libmysql_r/strings/mb_wc.h b/contrib/libs/libmysql_r/strings/mb_wc.h
new file mode 100644
index 0000000000..4deb473c99
--- /dev/null
+++ b/contrib/libs/libmysql_r/strings/mb_wc.h
@@ -0,0 +1,224 @@
+#ifndef MB_WC_INCLUDED
+#define MB_WC_INCLUDED
+
+/* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License, version 2.0, as published by the Free Software Foundation.
+
+ This library is also distributed with certain software (including
+ but not limited to OpenSSL) that is licensed under separate terms,
+ as designated in a particular file or component or in included license
+ documentation. The authors of MySQL hereby grant you an additional
+ permission to link the library and your derivative works with the
+ separately licensed software that they have included with MySQL.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License, version 2.0, for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with this library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
+ MA 02110-1301 USA */
+
+/**
+ @file mb_wc.h
+
+ Definitions of mb_wc (multibyte to wide character, ie., effectively
+ “parse a UTF-8 character”) functions for UTF-8 (both three- and four-byte).
+ These are available both as inline functions, as C-style thunks so that they
+ can fit into MY_CHARSET_HANDLER, and as functors.
+
+ The functors exist so that you can specialize a class on them and get them
+ inlined instead of having to call them through the function pointer in
+ MY_CHARSET_HANDLER; mb_wc is in itself so cheap (the most common case is
+ just a single byte load and a predictable compare) that the call overhead
+ in a tight loop is significant, and these routines tend to take up a lot
+ of CPU time when sorting. Typically, at the outermost level, you'd simply
+ compare cs->cset->mb_wc with my_mb_wc_{utf8,utf8mb4}_thunk, and if so,
+ instantiate your function with the given class. If it doesn't match,
+ you can use Mb_wc_through_function_pointer, which calls through the
+ function pointer as usual. (It will cache the function pointer for you,
+ which is typically faster than looking it up all the time -- the compiler
+ cannot always figure out on its own that it doesn't change.)
+
+ The Mb_wc_* classes should be sent by _value_, not by reference, since
+ they are never larger than two pointers (and usually simply zero).
+*/
+
+#include "m_ctype.h"
+#include "my_compiler.h"
+#include "my_config.h"
+
+template <bool RANGE_CHECK, bool SUPPORT_MB4>
+static int my_mb_wc_utf8_prototype(my_wc_t *pwc, const uchar *s,
+ const uchar *e);
+
+static int my_mb_wc_utf8(my_wc_t *pwc, const uchar *s, const uchar *e);
+static int my_mb_wc_utf8mb4(my_wc_t *pwc, const uchar *s, const uchar *e);
+
+/**
+ Functor that converts a UTF-8 multibyte sequence (up to three bytes)
+ to a wide character.
+*/
+struct Mb_wc_utf8 {
+ Mb_wc_utf8() {}
+
+ ALWAYS_INLINE
+ int operator()(my_wc_t *pwc, const uchar *s, const uchar *e) const {
+ return my_mb_wc_utf8(pwc, s, e);
+ }
+};
+
+/**
+ Functor that converts a UTF-8 multibyte sequence (up to four bytes)
+ to a wide character.
+*/
+struct Mb_wc_utf8mb4 {
+ Mb_wc_utf8mb4() {}
+
+ ALWAYS_INLINE
+ int operator()(my_wc_t *pwc, const uchar *s, const uchar *e) const {
+ return my_mb_wc_utf8mb4(pwc, s, e);
+ }
+};
+
+/**
+ Functor that uses a function pointer to convert a multibyte sequence
+ to a wide character.
+*/
+class Mb_wc_through_function_pointer {
+ public:
+ explicit Mb_wc_through_function_pointer(const CHARSET_INFO *cs)
+ : m_funcptr(cs->cset->mb_wc), m_cs(cs) {}
+
+ int operator()(my_wc_t *pwc, const uchar *s, const uchar *e) const {
+ return m_funcptr(m_cs, pwc, s, e);
+ }
+
+ private:
+ typedef int (*mbwc_func_t)(const CHARSET_INFO *, my_wc_t *, const uchar *,
+ const uchar *);
+
+ const mbwc_func_t m_funcptr;
+ const CHARSET_INFO *const m_cs;
+};
+
+template <bool RANGE_CHECK, bool SUPPORT_MB4>
+static ALWAYS_INLINE int my_mb_wc_utf8_prototype(my_wc_t *pwc, const uchar *s,
+ const uchar *e) {
+ if (RANGE_CHECK && s >= e) return MY_CS_TOOSMALL;
+
+ uchar c = s[0];
+ if (c < 0x80) {
+ *pwc = c;
+ return 1;
+ }
+
+ if (c < 0xe0) {
+ if (c < 0xc2) // Resulting code point would be less than 0x80.
+ return MY_CS_ILSEQ;
+
+ if (RANGE_CHECK && s + 2 > e) return MY_CS_TOOSMALL2;
+
+ if ((s[1] & 0xc0) != 0x80) // Next byte must be a continuation byte.
+ return MY_CS_ILSEQ;
+
+ *pwc = ((my_wc_t)(c & 0x1f) << 6) + (my_wc_t)(s[1] & 0x3f);
+ return 2;
+ }
+
+ if (c < 0xf0) {
+ if (RANGE_CHECK && s + 3 > e) return MY_CS_TOOSMALL3;
+
+ // Next two bytes must be continuation bytes.
+ uint16 two_bytes;
+ memcpy(&two_bytes, s + 1, sizeof(two_bytes));
+ if ((two_bytes & 0xc0c0) != 0x8080) // Endianness does not matter.
+ return MY_CS_ILSEQ;
+
+ *pwc = ((my_wc_t)(c & 0x0f) << 12) + ((my_wc_t)(s[1] & 0x3f) << 6) +
+ (my_wc_t)(s[2] & 0x3f);
+ if (*pwc < 0x800) return MY_CS_ILSEQ;
+ /*
+ According to RFC 3629, UTF-8 should prohibit characters between
+ U+D800 and U+DFFF, which are reserved for surrogate pairs and do
+ not directly represent characters.
+ */
+ if (*pwc >= 0xd800 && *pwc <= 0xdfff) return MY_CS_ILSEQ;
+ return 3;
+ }
+
+ if (SUPPORT_MB4) {
+ if (RANGE_CHECK && s + 4 > e) /* We need 4 characters */
+ return MY_CS_TOOSMALL4;
+
+ /*
+ This byte must be of the form 11110xxx, and the next three bytes
+ must be continuation bytes.
+ */
+ uint32 four_bytes;
+ memcpy(&four_bytes, s, sizeof(four_bytes));
+#ifdef WORDS_BIGENDIAN
+ if ((four_bytes & 0xf8c0c0c0) != 0xf0808080)
+#else
+ if ((four_bytes & 0xc0c0c0f8) != 0x808080f0)
+#endif
+ return MY_CS_ILSEQ;
+
+ *pwc = ((my_wc_t)(c & 0x07) << 18) + ((my_wc_t)(s[1] & 0x3f) << 12) +
+ ((my_wc_t)(s[2] & 0x3f) << 6) + (my_wc_t)(s[3] & 0x3f);
+ if (*pwc < 0x10000 || *pwc > 0x10ffff) return MY_CS_ILSEQ;
+ return 4;
+ }
+
+ return MY_CS_ILSEQ;
+}
+
+/**
+ Parses a single UTF-8 character from a byte string.
+
+ @param[out] pwc the parsed character, if any
+ @param s the string to read from
+ @param e the end of the string; will not read past this
+
+ @return the number of bytes read from s, or a value <= 0 for failure
+ (see m_ctype.h)
+*/
+static inline int my_mb_wc_utf8(my_wc_t *pwc, const uchar *s, const uchar *e) {
+ return my_mb_wc_utf8_prototype</*RANGE_CHECK=*/true, /*SUPPORT_MB4=*/false>(
+ pwc, s, e);
+}
+
+/**
+ Parses a single UTF-8 character from a byte string. The difference
+ between this and my_mb_wc_utf8 is that this function also can handle
+ four-byte UTF-8 characters.
+
+ @param[out] pwc the parsed character, if any
+ @param s the string to read from
+ @param e the end of the string; will not read past this
+
+ @return the number of bytes read from s, or a value <= 0 for failure
+ (see m_ctype.h)
+*/
+static ALWAYS_INLINE int my_mb_wc_utf8mb4(my_wc_t *pwc, const uchar *s,
+ const uchar *e) {
+ return my_mb_wc_utf8_prototype</*RANGE_CHECK=*/true, /*SUPPORT_MB4=*/true>(
+ pwc, s, e);
+}
+
+// Non-inlined versions of the above. These are used as function pointers
+// in MY_CHARSET_HANDLER structs, and you can compare againt them to see
+// if using the Mb_wc_utf8* functors would be appropriate.
+
+extern "C" int my_mb_wc_utf8_thunk(const CHARSET_INFO *cs, my_wc_t *pwc,
+ const uchar *s, const uchar *e);
+
+extern "C" int my_mb_wc_utf8mb4_thunk(const CHARSET_INFO *cs, my_wc_t *pwc,
+ const uchar *s, const uchar *e);
+
+#endif // MB_WC_INCLUDED