aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/libfyaml/src/lib/fy-utf8.h
diff options
context:
space:
mode:
authorDaniil Cherednik <dan.cherednik@gmail.com>2023-05-05 11:09:01 +0300
committerDaniil Cherednik <dan.cherednik@gmail.com>2023-05-05 11:09:01 +0300
commitb5a989b16cafa8a3b3bc076f1097a0eda6f48c06 (patch)
tree4da744117a5aab37758921fa43b95a3068e5aec1 /contrib/libs/libfyaml/src/lib/fy-utf8.h
parentfc1cffcfa7f0497a1f97b384a24bcbf23362f3be (diff)
downloadydb-b5a989b16cafa8a3b3bc076f1097a0eda6f48c06.tar.gz
Ydb stable 23-1-2623.1.26
x-stable-origin-commit: 22184a7e157553d447f17a2dffc4ea2d32dfd74d
Diffstat (limited to 'contrib/libs/libfyaml/src/lib/fy-utf8.h')
-rw-r--r--contrib/libs/libfyaml/src/lib/fy-utf8.h229
1 files changed, 229 insertions, 0 deletions
diff --git a/contrib/libs/libfyaml/src/lib/fy-utf8.h b/contrib/libs/libfyaml/src/lib/fy-utf8.h
new file mode 100644
index 0000000000..fa4faa1062
--- /dev/null
+++ b/contrib/libs/libfyaml/src/lib/fy-utf8.h
@@ -0,0 +1,229 @@
+/*
+ * fy-utf8.h - UTF-8 methods
+ *
+ * Copyright (c) 2019 Pantelis Antoniou <pantelis.antoniou@konsulko.com>
+ *
+ * SPDX-License-Identifier: MIT
+ */
+#ifndef FY_UTF8_H
+#define FY_UTF8_H
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include <libfyaml.h>
+
+#include "fy-utils.h"
+
+extern const int8_t fy_utf8_width_table[32];
+
+static inline int
+fy_utf8_width_by_first_octet_no_table(uint8_t c)
+{
+ return (c & 0x80) == 0x00 ? 1 :
+ (c & 0xe0) == 0xc0 ? 2 :
+ (c & 0xf0) == 0xe0 ? 3 :
+ (c & 0xf8) == 0xf0 ? 4 : 0;
+}
+
+static inline FY_ALWAYS_INLINE int
+fy_utf8_width_by_first_octet(uint8_t c)
+{
+ return fy_utf8_width_table[c >> 3];
+}
+
+/* assumes valid utf8 character */
+static inline size_t
+fy_utf8_width(int c)
+{
+ return c < 0x80 ? 1 :
+ c < 0x800 ? 2 :
+ c < 0x10000 ? 3 : 4;
+}
+
+static inline bool
+fy_utf8_is_valid(int c)
+{
+ return c >= 0 && !((c >= 0xd800 && c <= 0xdfff) || c >= 0x110000);
+}
+
+static inline bool
+fy_utf8_is_printable_ascii(int c)
+{
+ return c >= 0x20 && c <= 0x7e;
+}
+
+/* generic utf8 decoder (not inlined) */
+int fy_utf8_get_generic(const void *ptr, int left, int *widthp);
+
+/* -1 for end of input, -2 for invalid character, -3 for partial */
+#define FYUG_EOF -1
+#define FYUG_INV -2
+#define FYUG_PARTIAL -3
+
+static inline int fy_utf8_get(const void *ptr, int left, int *widthp)
+{
+ const uint8_t *p = ptr;
+
+ /* single byte (hot path) */
+ if (left <= 0) {
+ *widthp = 0;
+ return FYUG_EOF;
+ }
+
+ if (!(p[0] & 0x80)) {
+ *widthp = 1;
+ return p[0] & 0x7f;
+ }
+ return fy_utf8_get_generic(ptr, left, widthp);
+}
+
+int fy_utf8_get_right_generic(const void *ptr, int left, int *widthp);
+
+static inline int fy_utf8_get_right(const void *ptr, int left, int *widthp)
+{
+ const uint8_t *p = (const uint8_t*)ptr + left;
+
+ /* single byte (hot path) */
+ if (left > 0 && !(p[-1] & 0x80)) {
+ if (widthp)
+ *widthp = 1;
+ return p[-1] & 0x7f;
+ }
+ return fy_utf8_get_right_generic(ptr, left, widthp);
+}
+
+
+/* for when you _know_ that there's enough room and c is valid */
+static inline void *fy_utf8_put_unchecked(void *ptr, int c)
+{
+ uint8_t *s = ptr;
+
+ assert(c >= 0);
+ if (c < 0x80)
+ *s++ = c;
+ else if (c < 0x800) {
+ *s++ = (c >> 6) | 0xc0;
+ *s++ = (c & 0x3f) | 0x80;
+ } else if (c < 0x10000) {
+ *s++ = (c >> 12) | 0xe0;
+ *s++ = ((c >> 6) & 0x3f) | 0x80;
+ *s++ = (c & 0x3f) | 0x80;
+ } else {
+ *s++ = (c >> 18) | 0xf0;
+ *s++ = ((c >> 12) & 0x3f) | 0x80;
+ *s++ = ((c >> 6) & 0x3f) | 0x80;
+ *s++ = (c & 0x3f) | 0x80;
+ }
+ return s;
+}
+
+static inline void *fy_utf8_put(void *ptr, size_t left, int c)
+{
+ if (!fy_utf8_is_valid(c) || fy_utf8_width(c) > left)
+ return NULL;
+
+ return fy_utf8_put_unchecked(ptr, c);
+}
+
+/* buffer must contain at least 5 characters */
+#define FY_UTF8_FORMAT_BUFMIN 5
+enum fy_utf8_escape {
+ fyue_none,
+ fyue_singlequote,
+ fyue_doublequote,
+ fyue_doublequote_json,
+ fyue_doublequote_yaml_1_1,
+};
+
+static inline bool fy_utf8_escape_is_any_doublequote(enum fy_utf8_escape esc)
+{
+ return esc >= fyue_doublequote && esc <= fyue_doublequote_yaml_1_1;
+}
+
+char *fy_utf8_format(int c, char *buf, enum fy_utf8_escape esc);
+
+#define fy_utf8_format_a(_c, _esc, _res) \
+ do { \
+ char *_buf = FY_ALLOCA(FY_UTF8_FORMAT_BUFMIN); \
+ *(_res) = fy_utf8_format((_c), _buf, _esc); \
+ } while(false)
+
+int fy_utf8_format_text_length(const char *buf, size_t len,
+ enum fy_utf8_escape esc);
+char *fy_utf8_format_text(const char *buf, size_t len,
+ char *out, size_t maxsz,
+ enum fy_utf8_escape esc);
+
+#define fy_utf8_format_text_a(_buf, _len, _esc, _res) \
+ do { \
+ const char *__buf = (_buf); \
+ size_t __len = (_len); \
+ enum fy_utf8_escape __esc = (_esc); \
+ size_t _outsz = fy_utf8_format_text_length(__buf, __len, __esc); \
+ char *_out = FY_ALLOCA(_outsz + 1); \
+ *(_res) = fy_utf8_format_text(__buf, __len, _out, _outsz, __esc); \
+ } while(false)
+
+char *fy_utf8_format_text_alloc(const char *buf, size_t len, enum fy_utf8_escape esc);
+
+const void *fy_utf8_memchr_generic(const void *s, int c, size_t n);
+
+static inline const void *fy_utf8_memchr(const void *s, int c, size_t n)
+{
+ if (c < 0 || !n)
+ return NULL;
+ if (c < 0x80)
+ return memchr(s, c, n);
+ return fy_utf8_memchr_generic(s, c, n);
+}
+
+static inline const void *fy_utf8_strchr(const void *s, int c)
+{
+ if (c < 0)
+ return NULL;
+ if (c < 0x80)
+ return strchr(s, c);
+ return fy_utf8_memchr_generic(s, c, strlen(s));
+}
+
+static inline int fy_utf8_count(const void *ptr, size_t len)
+{
+ const uint8_t *s = ptr, *e = (const uint8_t *)ptr + len;
+ int w, count;
+
+ count = 0;
+ while (s < e) {
+ w = fy_utf8_width_by_first_octet(*s);
+
+ /* malformed? */
+ if (!w || s + w > e)
+ break;
+ s += w;
+
+ count++;
+ }
+
+ return count;
+}
+
+int fy_utf8_parse_escape(const char **strp, size_t len, enum fy_utf8_escape esc);
+
+#define F_NONE 0
+#define F_NON_PRINT FY_BIT(0) /* non printable */
+#define F_SIMPLE_SCALAR FY_BIT(1) /* part of simple scalar */
+#define F_QUOTE_ESC FY_BIT(2) /* escape form, i.e \n */
+#define F_LB FY_BIT(3) /* is a linebreak */
+#define F_WS FY_BIT(4) /* is a whitespace */
+#define F_PUNCT FY_BIT(5) /* is a punctuation mark */
+#define F_LETTER FY_BIT(6) /* is a letter a..z A..Z */
+#define F_DIGIT FY_BIT(7) /* is a digit 0..9 */
+
+extern uint8_t fy_utf8_low_ascii_flags[0x80];
+
+#endif