1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
|
/*
* fy-utf8.h - UTF-8 methods
*
* Copyright (c) 2019 Pantelis Antoniou <pantelis.antoniou@konsulko.com>
*
* SPDX-License-Identifier: MIT
*/
#ifndef FY_UTF8_H
#define FY_UTF8_H
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>
#include <libfyaml.h>
#include "fy-utils.h"
/*
 * Lookup table with 32 entries, defined elsewhere. It is indexed by the
 * top five bits of a sequence's first octet (c >> 3), see
 * fy_utf8_width_by_first_octet(). Entries are presumably the sequence
 * width in bytes, with 0 for an invalid lead octet - confirm against the
 * table definition.
 */
extern const int8_t fy_utf8_width_table[32];
/*
 * Width in bytes of the UTF-8 sequence introduced by lead octet c,
 * computed without any lookup table.
 *
 * Returns 1..4 for a valid lead octet, or 0 when c cannot start a
 * sequence (a continuation octet 0x80..0xbf, or 0xf8 and above).
 */
static inline int
fy_utf8_width_by_first_octet_no_table(uint8_t c)
{
	/* 0xxxxxxx: plain ASCII */
	if (c < 0x80)
		return 1;
	/* 10xxxxxx: continuation octet, never a lead */
	if (c < 0xc0)
		return 0;
	/* 110xxxxx */
	if (c < 0xe0)
		return 2;
	/* 1110xxxx */
	if (c < 0xf0)
		return 3;
	/* 11110xxx */
	if (c < 0xf8)
		return 4;
	/* 11111xxx: not a valid lead octet */
	return 0;
}
/*
 * Table-driven lookup of a UTF-8 sequence width from its lead octet.
 * The low three bits of c are dropped, so the remaining five bits
 * select one of the 32 entries of fy_utf8_width_table.
 */
static inline FY_ALWAYS_INLINE int
fy_utf8_width_by_first_octet(uint8_t c)
{
	const unsigned int slot = c >> 3;	/* always in 0..31 */

	return fy_utf8_width_table[slot];
}
/*
 * Number of bytes needed to encode code point c as UTF-8.
 *
 * The caller is expected to pass a valid code point: no range check
 * is performed, so anything below 0x80 (negative values included)
 * yields 1 and anything from 0x10000 upwards yields 4.
 */
static inline size_t
fy_utf8_width(int c)
{
	if (c < 0x80)
		return 1;
	if (c < 0x800)
		return 2;
	if (c < 0x10000)
		return 3;
	return 4;
}
/*
 * Check whether c is a code point that may be encoded as UTF-8:
 * non-negative, at most 0x10ffff and outside the surrogate range
 * 0xd800..0xdfff.
 */
static inline bool
fy_utf8_is_valid(int c)
{
	/* outside the code space altogether */
	if (c < 0 || c > 0x10ffff)
		return false;
	/* surrogate halves are not encodable */
	return c < 0xd800 || c > 0xdfff;
}
/*
 * True when c lies in the printable ASCII range, 0x20 (space)
 * through 0x7e (tilde) inclusive.
 */
static inline bool
fy_utf8_is_printable_ascii(int c)
{
	/* control characters and negative values */
	if (c < 0x20)
		return false;
	/* 0x7f (DEL) and everything above is excluded */
	return c < 0x7f;
}
/*
 * Generic UTF-8 decoder (not inlined), defined elsewhere.
 * fy_utf8_get() falls back to it whenever the first octet has its
 * high bit set, passing ptr, left and widthp through unchanged.
 */
int fy_utf8_get_generic(const void *ptr, int left, int *widthp);
/*
 * Negative result codes of the decoders:
 * -1 for end of input, -2 for invalid character, -3 for partial
 */
#define FYUG_EOF -1
#define FYUG_INV -2
#define FYUG_PARTIAL -3
/*
 * Decode the UTF-8 character at the start of a buffer.
 *
 * ptr    - start of the buffer
 * left   - number of bytes available at ptr
 * widthp - receives the number of bytes consumed; it is written
 *          unconditionally on the paths handled here, so it must
 *          not be NULL
 *
 * With no input (left <= 0) *widthp is set to 0 and FYUG_EOF is
 * returned. A single ASCII octet is decoded inline; every other case
 * is forwarded to fy_utf8_get_generic().
 */
static inline int fy_utf8_get(const void *ptr, int left, int *widthp)
{
	const uint8_t *bytes;
	uint8_t lead;

	if (left < 1) {
		*widthp = 0;
		return FYUG_EOF;
	}

	bytes = ptr;
	lead = bytes[0];

	/* single byte (hot path): high bit clear, the value is the octet */
	if (lead < 0x80) {
		*widthp = 1;
		return lead;
	}

	return fy_utf8_get_generic(ptr, left, widthp);
}
/*
 * Out-of-line counterpart of fy_utf8_get_right(), defined elsewhere.
 * The inline wrapper calls it whenever left <= 0 or the last octet of
 * the buffer has its high bit set.
 */
int fy_utf8_get_right_generic(const void *ptr, int left, int *widthp);
/*
 * Decode the UTF-8 character that ends at the end of a buffer, i.e.
 * the one whose final octet is at ptr[left - 1].
 *
 * ptr    - start of the buffer
 * left   - number of bytes in the buffer
 * widthp - if not NULL, receives 1 on the inline single-byte path;
 *          on every other path it is handed to
 *          fy_utf8_get_right_generic() as is
 *
 * Returns the code point (0..0x7f) when the last octet is plain ASCII,
 * otherwise whatever fy_utf8_get_right_generic() returns.
 */
static inline int fy_utf8_get_right(const void *ptr, int left, int *widthp)
{
	/*
	 * Only touch the end of the buffer once left is known to be
	 * positive: forming ptr + left for a negative left creates an
	 * out-of-bounds pointer, which is undefined even if it is never
	 * dereferenced.
	 */
	if (left > 0) {
		const uint8_t last = ((const uint8_t *)ptr)[left - 1];

		/* single byte (hot path) */
		if (!(last & 0x80)) {
			if (widthp)
				*widthp = 1;
			return last & 0x7f;
		}
	}
	return fy_utf8_get_right_generic(ptr, left, widthp);
}
/*
 * Encode code point c as UTF-8 at ptr with no validation.
 *
 * Use only when the destination is known to have enough room
 * (up to 4 bytes) and c is known to be a valid code point; the only
 * check made is an assert() that c is not negative.
 *
 * Returns a pointer just past the last byte written.
 */
static inline void *fy_utf8_put_unchecked(void *ptr, int c)
{
	uint8_t *out = ptr;

	assert(c >= 0);

	/* 1 byte: 0xxxxxxx */
	if (c < 0x80) {
		out[0] = c;
		return out + 1;
	}

	/* 2 bytes: 110xxxxx 10xxxxxx */
	if (c < 0x800) {
		out[0] = 0xc0 | (c >> 6);
		out[1] = 0x80 | (c & 0x3f);
		return out + 2;
	}

	/* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
	if (c < 0x10000) {
		out[0] = 0xe0 | (c >> 12);
		out[1] = 0x80 | ((c >> 6) & 0x3f);
		out[2] = 0x80 | (c & 0x3f);
		return out + 3;
	}

	/* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	out[0] = 0xf0 | (c >> 18);
	out[1] = 0x80 | ((c >> 12) & 0x3f);
	out[2] = 0x80 | ((c >> 6) & 0x3f);
	out[3] = 0x80 | (c & 0x3f);
	return out + 4;
}
/*
 * Checked variant of fy_utf8_put_unchecked().
 *
 * ptr  - destination
 * left - bytes available at ptr
 * c    - code point to encode
 *
 * Returns NULL, writing nothing, when c is rejected by
 * fy_utf8_is_valid() or its encoding needs more than 'left' bytes.
 * Otherwise returns the pointer just past the encoded bytes.
 */
static inline void *fy_utf8_put(void *ptr, size_t left, int c)
{
	/* validity first: the width is only meaningful for a valid c */
	if (!fy_utf8_is_valid(c))
		return NULL;

	if (left < fy_utf8_width(c))
		return NULL;

	return fy_utf8_put_unchecked(ptr, c);
}
/*
 * Minimum size of the buffer handed to fy_utf8_format(): it must hold
 * at least 5 characters (presumably up to 4 octets of UTF-8 plus a
 * terminating NUL - confirm against the implementation).
 */
#define FY_UTF8_FORMAT_BUFMIN 5
/*
 * Escaping style selector taken by the formatting and escape-parsing
 * routines declared in this header. The exact meaning of each member
 * is decided by those routines; judging by the names they are "no
 * escaping", single-quoted, and three double-quoted flavours - confirm
 * against the implementation.
 *
 * The three fyue_doublequote* members must stay contiguous, with
 * fyue_doublequote first and fyue_doublequote_yaml_1_1 last:
 * fy_utf8_escape_is_any_doublequote() tests for that range.
 */
enum fy_utf8_escape {
fyue_none,
fyue_singlequote,
fyue_doublequote,
fyue_doublequote_json,
fyue_doublequote_yaml_1_1,
};
static inline bool fy_utf8_escape_is_any_doublequote(enum fy_utf8_escape esc)
{
return esc >= fyue_doublequote && esc <= fyue_doublequote_yaml_1_1;
}
/*
 * Format a single code point into a caller supplied buffer; defined
 * elsewhere. NOTE(review): presumably buf must provide at least
 * FY_UTF8_FORMAT_BUFMIN bytes and the return value points at the
 * formatted text - confirm against the implementation.
 */
char *fy_utf8_format(int c, char *buf, enum fy_utf8_escape esc);
/*
 * Convenience wrapper around fy_utf8_format() that obtains the buffer
 * with FY_ALLOCA(FY_UTF8_FORMAT_BUFMIN) and stores the function's
 * return value through the pointer _res.
 *
 * This has to remain a macro: the buffer is allocated in the frame of
 * the function that expands it (assuming FY_ALLOCA is an alloca-style
 * stack allocation - confirm), so the result must not be used after
 * that function returns.
 */
#define fy_utf8_format_a(_c, _esc, _res) \
do { \
char *_buf = FY_ALLOCA(FY_UTF8_FORMAT_BUFMIN); \
*(_res) = fy_utf8_format((_c), _buf, _esc); \
} while(false)
/*
 * Size query for fy_utf8_format_text(); defined elsewhere.
 * fy_utf8_format_text_a() uses the result to size its output buffer.
 */
int fy_utf8_format_text_length(const char *buf, size_t len,
enum fy_utf8_escape esc);
/*
 * Format len bytes of text at buf into out (capacity maxsz) using
 * escape style esc; defined elsewhere. NOTE(review): the return value
 * is presumably the formatted string - confirm.
 */
char *fy_utf8_format_text(const char *buf, size_t len,
char *out, size_t maxsz,
enum fy_utf8_escape esc);
/*
 * Format text into a buffer obtained with FY_ALLOCA and store the
 * result of fy_utf8_format_text() through the pointer _res.
 *
 * Each argument is evaluated exactly once (copied into a local first).
 * The buffer is _outsz + 1 bytes, where _outsz is the value returned
 * by fy_utf8_format_text_length(), while _outsz itself is passed as
 * the maximum size.
 *
 * NOTE(review): fy_utf8_format_text_length() returns int but the value
 * is stored in a size_t; a negative return would turn into a huge
 * allocation size - confirm that it can never be negative.
 *
 * This has to remain a macro: the buffer is allocated in the frame of
 * the function that expands it (assuming FY_ALLOCA is an alloca-style
 * stack allocation - confirm), so the result must not be used after
 * that function returns.
 */
#define fy_utf8_format_text_a(_buf, _len, _esc, _res) \
do { \
const char *__buf = (_buf); \
size_t __len = (_len); \
enum fy_utf8_escape __esc = (_esc); \
size_t _outsz = fy_utf8_format_text_length(__buf, __len, __esc); \
char *_out = FY_ALLOCA(_outsz + 1); \
*(_res) = fy_utf8_format_text(__buf, __len, _out, _outsz, __esc); \
} while(false)
/*
 * Variant of fy_utf8_format_text() that takes no output buffer;
 * defined elsewhere. NOTE(review): judging by the name the result is
 * heap allocated and owned by the caller - confirm how it must be
 * released.
 */
char *fy_utf8_format_text_alloc(const char *buf, size_t len, enum fy_utf8_escape esc);
/*
 * Out-of-line search for code point c in the n bytes at s; defined
 * elsewhere. fy_utf8_memchr() and fy_utf8_strchr() only call it for
 * c >= 0x80, i.e. for characters that take more than one octet.
 */
const void *fy_utf8_memchr_generic(const void *s, int c, size_t n);
/*
 * Find the first occurrence of code point c in the n bytes at s.
 *
 * Returns NULL right away for a negative c or an empty range.
 * ASCII characters are searched with memchr(); anything from 0x80
 * upwards is delegated to fy_utf8_memchr_generic().
 */
static inline const void *fy_utf8_memchr(const void *s, int c, size_t n)
{
	if (n == 0 || c < 0)
		return NULL;

	return c < 0x80 ? memchr(s, c, n)
			: fy_utf8_memchr_generic(s, c, n);
}
/*
 * Find the first occurrence of code point c in the NUL terminated
 * string s.
 *
 * Returns NULL for a negative c. ASCII characters are searched with
 * strchr(); anything from 0x80 upwards is delegated to
 * fy_utf8_memchr_generic() over strlen(s) bytes.
 */
static inline const void *fy_utf8_strchr(const void *s, int c)
{
	/* multi-byte character: needs the generic search */
	if (c >= 0x80)
		return fy_utf8_memchr_generic(s, c, strlen(s));

	/* plain ASCII */
	if (c >= 0)
		return strchr(s, c);

	return NULL;
}
/*
 * Count the UTF-8 characters in the len bytes at ptr.
 *
 * Only the lead octet of each sequence is examined (through
 * fy_utf8_width_by_first_octet()); continuation octets are skipped
 * without being validated.
 *
 * Counting stops at the first lead octet that has a width of 0 or
 * whose sequence would extend past the end of the buffer; the number
 * of characters counted up to that point is returned.
 */
static inline int fy_utf8_count(const void *ptr, size_t len)
{
	const uint8_t *s = ptr;
	const uint8_t *e = (const uint8_t *)ptr + len;
	int w, count = 0;

	while (s < e) {
		w = fy_utf8_width_by_first_octet(*s);
		/*
		 * malformed or truncated?
		 *
		 * Compare against the remaining length rather than
		 * forming s + w: that pointer could land more than one
		 * past the end of the buffer, which is undefined.
		 */
		if (!w || w > e - s)
			break;
		s += w;
		count++;
	}
	return count;
}
/*
 * Parse an escape sequence; defined elsewhere. NOTE(review): strp is
 * presumably advanced past the consumed input and the decoded code
 * point (or a negative error) returned - confirm against the
 * implementation.
 */
int fy_utf8_parse_escape(const char **strp, size_t len, enum fy_utf8_escape esc);
/*
 * Character class flags. Apart from F_NONE each one is built with
 * FY_BIT() on a distinct bit number 0..7 (presumably a single-bit
 * mask, so that flags can be OR-ed together and a full set fits in a
 * uint8_t - confirm against the FY_BIT definition).
 */
#define F_NONE 0
#define F_NON_PRINT FY_BIT(0) /* non printable */
#define F_SIMPLE_SCALAR FY_BIT(1) /* part of simple scalar */
#define F_QUOTE_ESC FY_BIT(2) /* escape form, i.e. \n */
#define F_LB FY_BIT(3) /* is a linebreak */
#define F_WS FY_BIT(4) /* is a whitespace */
#define F_PUNCT FY_BIT(5) /* is a punctuation mark */
#define F_LETTER FY_BIT(6) /* is a letter a..z A..Z */
#define F_DIGIT FY_BIT(7) /* is a digit 0..9 */
/*
 * Table with 0x80 entries, defined elsewhere; presumably one F_* flag
 * set for each 7-bit ASCII character code - confirm.
 */
extern uint8_t fy_utf8_low_ascii_flags[0x80];
#endif
|