about summary refs log tree commit diff stats
path: root/contrib/libs/utf8proc/utf8proc.c
diff options
context:
space:
mode:
author: robot-contrib <robot-contrib@yandex-team.ru> 2022-06-19 20:39:07 +0300
committer: robot-contrib <robot-contrib@yandex-team.ru> 2022-06-19 20:39:07 +0300
commit: f403b8fac5305b025fbbed70b9396e28ac62df50 (patch)
tree: 41d356f206d4d92d934b231fcf93d32558d19783 /contrib/libs/utf8proc/utf8proc.c
parent: 6a3b6987f1efa14331d711f12cdcb242fcae4995 (diff)
download: ydb-f403b8fac5305b025fbbed70b9396e28ac62df50.tar.gz
Update contrib/libs/utf8proc to 2.7.0
ref:e9f684e9e9c1d13cd51d1547614f405b8a6b5620
Diffstat (limited to 'contrib/libs/utf8proc/utf8proc.c')
-rw-r--r-- contrib/libs/utf8proc/utf8proc.c 116
1 files changed, 77 insertions, 39 deletions
diff --git a/contrib/libs/utf8proc/utf8proc.c b/contrib/libs/utf8proc/utf8proc.c
index c0f84d9702..6752a553bf 100644
--- a/contrib/libs/utf8proc/utf8proc.c
+++ b/contrib/libs/utf8proc/utf8proc.c
@@ -1,6 +1,6 @@
/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
/*
- * Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
+ * Copyright (c) 2014-2021 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -27,7 +27,7 @@
* Unicode data files.
*
* The original data files are available at
- * http://www.unicode.org/Public/UNIDATA/
+ * https://www.unicode.org/Public/UNIDATA/
*
* Please notice the copyright statement in the file "utf8proc_data.c".
*/
@@ -42,6 +42,14 @@
#include "utf8proc.h"
+
+#ifndef SSIZE_MAX
+#define SSIZE_MAX ((size_t)SIZE_MAX/2) /* fallback, used only when SSIZE_MAX is not already defined: half of SIZE_MAX */
+#endif
+#ifndef UINT16_MAX
+# define UINT16_MAX 65535U /* fallback, used only when UINT16_MAX is not already defined */
+#endif
+
#include "utf8proc_data.c"
@@ -92,6 +100,10 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
}
+UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
+ return "14.0.0"; /* fixed string literal; NOTE(review): presumably the Unicode version of the bundled data tables - confirm against utf8proc_data.c */
+}
+
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
switch (errcode) {
case UTF8PROC_ERROR_NOMEM:
@@ -113,7 +125,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
) {
- utf8proc_uint32_t uc;
+ utf8proc_int32_t uc;
const utf8proc_uint8_t *end;
*dst = -1;
@@ -125,7 +137,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
return 1;
}
// Must be between 0xc2 and 0xf4 inclusive to be valid
- if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
+ if ((utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
if (uc < 0xe0) { // 2-byte sequence
// Must have valid continuation character
if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
@@ -188,9 +200,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut
} else return 0;
}
-/* internal "unsafe" version that does not check whether uc is in range */
-static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
+/* internal version used for inserting 0xff bytes between graphemes */
+static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
if (uc < 0x00) {
+ if (uc == -1) { /* internal value used for grapheme breaks */
+ dst[0] = (utf8proc_uint8_t)0xFF;
+ return 1;
+ }
return 0;
} else if (uc < 0x80) {
dst[0] = (utf8proc_uint8_t)uc;
@@ -199,12 +215,6 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t
dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
return 2;
- } else if (uc == 0xFFFF) {
- dst[0] = (utf8proc_uint8_t)0xFF;
- return 1;
- } else if (uc == 0xFFFE) {
- dst[0] = (utf8proc_uint8_t)0xFE;
- return 1;
} else if (uc < 0x10000) {
dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
@@ -271,12 +281,8 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
- ((lbc == UTF8PROC_BOUNDCLASS_E_BASE || // GB10 (requires additional handling below)
- lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && // ----
- tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
- (lbc == UTF8PROC_BOUNDCLASS_ZWJ && // GB11
- (tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || // ----
- tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : // ----
+ (lbc == UTF8PROC_BOUNDCLASS_E_ZWG && // GB11 (requires additional handling below)
+ tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ----
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
true; // GB999
@@ -284,10 +290,14 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
{
- int lbc_override = ((state && *state != UTF8PROC_BOUNDCLASS_START)
- ? *state : lbc);
- utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
if (state) {
+ int lbc_override;
+ if (*state == UTF8PROC_BOUNDCLASS_START)
+ *state = lbc_override = lbc;
+ else
+ lbc_override = *state;
+ utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
+
// Special support for GB 12/13 made possible by GB999. After two RI
// class codepoints we want to force a break. Do this by resetting the
// second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
@@ -295,16 +305,22 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
// forbidden by a different rule such as GB9).
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
*state = UTF8PROC_BOUNDCLASS_OTHER;
- // Special support for GB10. Fold any EXTEND codepoints into the previous
- // boundclass if we're dealing with an emoji base boundclass.
- else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE ||
- *state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
- tbc == UTF8PROC_BOUNDCLASS_EXTEND)
- *state = UTF8PROC_BOUNDCLASS_E_BASE;
+ // Special support for GB11 (emoji extend* zwj / emoji)
+ else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
+ if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
+ *state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
+ else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
+ *state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
+ else
+ *state = tbc;
+ }
else
*state = tbc;
+
+ return break_permitted;
}
- return break_permitted;
+ else
+ return grapheme_break_simple(lbc, tbc);
}
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
@@ -340,9 +356,9 @@ static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
utf8proc_ssize_t written = 0;
- const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF];
- int len = seqindex >> 13;
- if (len >= 7) {
+ const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x3FFF];
+ int len = seqindex >> 14;
+ if (len >= 3) {
len = *entry;
entry++;
}
@@ -360,19 +376,31 @@ static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqinde
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
{
utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex;
- return cl != UINT16_MAX ? seqindex_decode_index(cl) : c;
+ return cl != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cl) : c; /* a lowercase_seqindex of UINT16_MAX returns c unchanged; otherwise the index is decoded */
}
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
{
utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex;
- return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
+ return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c; /* an uppercase_seqindex of UINT16_MAX returns c unchanged; otherwise the index is decoded */
}
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
{
utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex;
- return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
+ return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c; /* a titlecase_seqindex of UINT16_MAX returns c unchanged; otherwise the index is decoded */
+}
+
+UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c)
+{
+ const utf8proc_property_t *p = utf8proc_get_property(c);
+ return p->lowercase_seqindex != p->uppercase_seqindex && p->lowercase_seqindex == UINT16_MAX; /* nonzero when the two seqindex fields differ and lowercase_seqindex is UINT16_MAX (the value for which utf8proc_tolower returns c unchanged) */
+}
+
+UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c)
+{
+ const utf8proc_property_t *p = utf8proc_get_property(c);
+ return p->lowercase_seqindex != p->uppercase_seqindex && p->uppercase_seqindex == UINT16_MAX && p->category != UTF8PROC_CATEGORY_LT; /* nonzero when the two seqindex fields differ, uppercase_seqindex is UINT16_MAX (the value for which utf8proc_toupper returns c unchanged), and the category is not UTF8PROC_CATEGORY_LT */
+}
/* return a character width analogous to wcwidth (except portable and
@@ -382,7 +410,7 @@ UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
}
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
- return utf8proc_get_property(c)->category;
+ return (utf8proc_category_t) utf8proc_get_property(c)->category; /* explicit cast of the property's category field to the declared return type */
}
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
@@ -392,7 +420,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
#define utf8proc_decompose_lump(replacement_uc) \
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
- options & ~UTF8PROC_LUMP, last_boundclass)
+ options & ~(unsigned int)UTF8PROC_LUMP, last_boundclass) /* returns from the enclosing function; the LUMP bit is cleared so the nested call does not take the UTF8PROC_LUMP branch again */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
const utf8proc_property_t *property;
@@ -423,6 +451,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
if (options & UTF8PROC_IGNORE) {
if (property->ignorable) return 0;
}
+ if (options & UTF8PROC_STRIPNA) {
+ if (!category) return 0;
+ }
if (options & UTF8PROC_LUMP) {
if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
@@ -470,7 +501,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
int tbc = property->boundclass;
boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
if (boundary) {
- if (bufsize >= 1) dst[0] = 0xFFFF;
+ if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
if (bufsize >= 2) dst[1] = uc;
return 2;
}
@@ -676,7 +707,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
if (options & UTF8PROC_CHARBOUND) {
for (rpos = 0; rpos < length; rpos++) {
uc = buffer[rpos];
- wpos += unsafe_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
+ wpos += charbound_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
}
} else {
for (rpos = 0; rpos < length; rpos++) {
@@ -704,7 +735,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
*dstptr = NULL;
result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
if (result < 0) return result;
- buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
+ buffer = (utf8proc_int32_t *) malloc(((utf8proc_size_t)result) * sizeof(utf8proc_int32_t) + 1);
if (!buffer) return UTF8PROC_ERROR_NOMEM;
result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
if (result < 0) {
@@ -752,3 +783,10 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str)
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
return retval;
}
+
+UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) {
+ utf8proc_uint8_t *retval;
+ utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | /* length argument is 0; UTF8PROC_NULLTERM is passed alongside it */
+ UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE);
+ return retval; /* the result code of utf8proc_map is discarded; NOTE(review): presumably retval is NULL on failure - confirm against utf8proc_map */
+}