diff options
author | shadchin <shadchin@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
commit | 2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch) | |
tree | 012bb94d777798f1f56ac1cec429509766d05181 /contrib/tools/python3/src/Modules/unicodedata.c | |
parent | 6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff) | |
download | ydb-2598ef1d0aee359b4b6d5fdd1758916d5907d04f.tar.gz |
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/tools/python3/src/Modules/unicodedata.c')
-rw-r--r-- | contrib/tools/python3/src/Modules/unicodedata.c | 314 |
1 files changed, 157 insertions, 157 deletions
diff --git a/contrib/tools/python3/src/Modules/unicodedata.c b/contrib/tools/python3/src/Modules/unicodedata.c index 8a1198a2b7..847c16da72 100644 --- a/contrib/tools/python3/src/Modules/unicodedata.c +++ b/contrib/tools/python3/src/Modules/unicodedata.c @@ -17,15 +17,15 @@ #include "Python.h" #include "ucnhash.h" -#include "structmember.h" // PyMemberDef - -#include <stdbool.h> - -_Py_IDENTIFIER(NFC); -_Py_IDENTIFIER(NFD); -_Py_IDENTIFIER(NFKC); -_Py_IDENTIFIER(NFKD); - +#include "structmember.h" // PyMemberDef + +#include <stdbool.h> + +_Py_IDENTIFIER(NFC); +_Py_IDENTIFIER(NFD); +_Py_IDENTIFIER(NFKC); +_Py_IDENTIFIER(NFKD); + /*[clinic input] module unicodedata class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type' @@ -92,7 +92,7 @@ static PyMemberDef DB_members[] = { /* forward declaration */ static PyTypeObject UCD_Type; -#define UCD_Check(o) Py_IS_TYPE(o, &UCD_Type) +#define UCD_Check(o) Py_IS_TYPE(o, &UCD_Type) static PyObject* new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4), @@ -496,7 +496,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) Py_UCS4 *output; Py_ssize_t i, o, osize; int kind; - const void *data; + const void *data; /* Longest decomposition in Unicode 3.2: U+FDFA */ Py_UCS4 stack[20]; Py_ssize_t space, isize; @@ -623,7 +623,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) } static int -find_nfc_index(const struct reindex* nfc, Py_UCS4 code) +find_nfc_index(const struct reindex* nfc, Py_UCS4 code) { unsigned int index; for (index = 0; nfc[index].start; index++) { @@ -643,7 +643,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) { PyObject *result; int kind; - const void *data; + const void *data; Py_UCS4 *output; Py_ssize_t i, i1, o, len; int f,l,index,index1,comb; @@ -709,7 +709,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) } /* code is still input[i] here */ - f = find_nfc_index(nfc_first, code); + f = find_nfc_index(nfc_first, code); if (f == -1) { output[o++] = code; i++; @@ -732,7 +732,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) continue; } } - l = find_nfc_index(nfc_last, code1); + l = find_nfc_index(nfc_last, code1); /* i1 cannot be combined with i. If i1 is a starter, we don't need to look further. Otherwise, record the combining class. */ @@ -757,7 +757,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) assert(cskipped < 20); skipped[cskipped++] = i1; i1++; - f = find_nfc_index(nfc_first, output[o]); + f = find_nfc_index(nfc_first, output[o]); if (f == -1) break; } @@ -777,40 +777,40 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) return result; } -// This needs to match the logic in makeunicodedata.py -// which constructs the quickcheck data. -typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult; - -/* Run the Unicode normalization "quickcheck" algorithm. - * - * Return YES or NO if quickcheck determines the input is certainly - * normalized or certainly not, and MAYBE if quickcheck is unable to - * tell. - * - * If `yes_only` is true, then return MAYBE as soon as we determine - * the answer is not YES. - * - * For background and details on the algorithm, see UAX #15: - * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms - */ -static QuickcheckResult -is_normalized_quickcheck(PyObject *self, PyObject *input, - bool nfc, bool k, bool yes_only) +// This needs to match the logic in makeunicodedata.py +// which constructs the quickcheck data. +typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult; + +/* Run the Unicode normalization "quickcheck" algorithm. + * + * Return YES or NO if quickcheck determines the input is certainly + * normalized or certainly not, and MAYBE if quickcheck is unable to + * tell. + * + * If `yes_only` is true, then return MAYBE as soon as we determine + * the answer is not YES. + * + * For background and details on the algorithm, see UAX #15: + * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms + */ +static QuickcheckResult +is_normalized_quickcheck(PyObject *self, PyObject *input, + bool nfc, bool k, bool yes_only) { - /* An older version of the database is requested, quickchecks must be - disabled. */ - if (self && UCD_Check(self)) - return NO; - + /* An older version of the database is requested, quickchecks must be + disabled. */ + if (self && UCD_Check(self)) + return NO; + Py_ssize_t i, len; int kind; - const void *data; - unsigned char prev_combining = 0; + const void *data; + unsigned char prev_combining = 0; - /* The two quickcheck bits at this shift have type QuickcheckResult. */ - int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0); + /* The two quickcheck bits at this shift have type QuickcheckResult. */ + int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0); - QuickcheckResult result = YES; /* certainly normalized, unless we find something */ + QuickcheckResult result = YES; /* certainly normalized, unless we find something */ i = 0; kind = PyUnicode_KIND(input); @@ -819,106 +819,106 @@ is_normalized_quickcheck(PyObject *self, PyObject *input, while (i < len) { Py_UCS4 ch = PyUnicode_READ(kind, data, i++); const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch); - + unsigned char combining = record->combining; if (combining && prev_combining > combining) - return NO; /* non-canonical sort order, not normalized */ + return NO; /* non-canonical sort order, not normalized */ prev_combining = combining; - - unsigned char quickcheck_whole = record->normalization_quick_check; - if (yes_only) { - if (quickcheck_whole & (3 << quickcheck_shift)) - return MAYBE; - } else { - switch ((quickcheck_whole >> quickcheck_shift) & 3) { - case NO: - return NO; - case MAYBE: - result = MAYBE; /* this string might need normalization */ - } - } + + unsigned char quickcheck_whole = record->normalization_quick_check; + if (yes_only) { + if (quickcheck_whole & (3 << quickcheck_shift)) + return MAYBE; + } else { + switch ((quickcheck_whole >> quickcheck_shift) & 3) { + case NO: + return NO; + case MAYBE: + result = MAYBE; /* this string might need normalization */ + } + } } - return result; + return result; } /*[clinic input] -unicodedata.UCD.is_normalized - - self: self - form: unicode - unistr as input: unicode - / - -Return whether the Unicode string unistr is in the normal form 'form'. - -Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. -[clinic start generated code]*/ - -static PyObject * -unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, - PyObject *input) -/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/ -{ - if (PyUnicode_READY(input) == -1) { - return NULL; - } - - if (PyUnicode_GET_LENGTH(input) == 0) { - /* special case empty input strings. */ - Py_RETURN_TRUE; - } - - PyObject *result; - bool nfc = false; - bool k = false; - QuickcheckResult m; - - PyObject *cmp; - int match = 0; - - if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { - nfc = true; - } - else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { - nfc = true; - k = true; - } - else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { - /* matches default values for `nfc` and `k` */ - } - else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { - k = true; - } - else { - PyErr_SetString(PyExc_ValueError, "invalid normalization form"); - return NULL; - } - - m = is_normalized_quickcheck(self, input, nfc, k, false); - - if (m == MAYBE) { - cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k); - if (cmp == NULL) { - return NULL; - } - match = PyUnicode_Compare(input, cmp); - Py_DECREF(cmp); - result = (match == 0) ? Py_True : Py_False; - } - else { - result = (m == YES) ? Py_True : Py_False; - } - - Py_INCREF(result); - return result; -} - - -/*[clinic input] +unicodedata.UCD.is_normalized + + self: self + form: unicode + unistr as input: unicode + / + +Return whether the Unicode string unistr is in the normal form 'form'. + +Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. +[clinic start generated code]*/ + +static PyObject * +unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, + PyObject *input) +/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/ +{ + if (PyUnicode_READY(input) == -1) { + return NULL; + } + + if (PyUnicode_GET_LENGTH(input) == 0) { + /* special case empty input strings. */ + Py_RETURN_TRUE; + } + + PyObject *result; + bool nfc = false; + bool k = false; + QuickcheckResult m; + + PyObject *cmp; + int match = 0; + + if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { + nfc = true; + } + else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { + nfc = true; + k = true; + } + else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { + /* matches default values for `nfc` and `k` */ + } + else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { + k = true; + } + else { + PyErr_SetString(PyExc_ValueError, "invalid normalization form"); + return NULL; + } + + m = is_normalized_quickcheck(self, input, nfc, k, false); + + if (m == MAYBE) { + cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k); + if (cmp == NULL) { + return NULL; + } + match = PyUnicode_Compare(input, cmp); + Py_DECREF(cmp); + result = (match == 0) ? Py_True : Py_False; + } + else { + result = (m == YES) ? Py_True : Py_False; + } + + Py_INCREF(result); + return result; +} + + +/*[clinic input] unicodedata.UCD.normalize self: self - form: unicode + form: unicode unistr as input: unicode / @@ -928,9 +928,9 @@ Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. [clinic start generated code]*/ static PyObject * -unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, +unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, PyObject *input) -/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/ +/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/ { if (PyUnicode_GET_LENGTH(input) == 0) { /* Special case empty input strings, since resizing @@ -939,29 +939,29 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, return input; } - if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { - if (is_normalized_quickcheck(self, input, true, false, true) == YES) { + if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { + if (is_normalized_quickcheck(self, input, true, false, true) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 0); } - if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { - if (is_normalized_quickcheck(self, input, true, true, true) == YES) { + if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { + if (is_normalized_quickcheck(self, input, true, true, true) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 1); } - if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { - if (is_normalized_quickcheck(self, input, false, false, true) == YES) { + if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { + if (is_normalized_quickcheck(self, input, false, false, true) == YES) { Py_INCREF(input); return input; } return nfd_nfkd(self, input, 0); } - if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { - if (is_normalized_quickcheck(self, input, false, true, true) == YES) { + if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { + if (is_normalized_quickcheck(self, input, false, true, true) == YES) { Py_INCREF(input); return input; } @@ -987,7 +987,7 @@ _gethash(const char *s, int len, int scale) unsigned long h = 0; unsigned long ix; for (i = 0; i < len; i++) { - h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]); + h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]); ix = h & 0xff000000; if (ix) h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff; @@ -1031,14 +1031,14 @@ static int is_unified_ideograph(Py_UCS4 code) { return - (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */ - (0x4E00 <= code && code <= 0x9FFC) || /* CJK Ideograph */ - (0x20000 <= code && code <= 0x2A6DD) || /* CJK Ideograph Extension B */ + (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */ + (0x4E00 <= code && code <= 0x9FFC) || /* CJK Ideograph */ + (0x20000 <= code && code <= 0x2A6DD) || /* CJK Ideograph Extension B */ (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */ (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */ (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */ - (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */ - (0x30000 <= code && code <= 0x3134A); /* CJK Ideograph Extension G */ + (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */ + (0x30000 <= code && code <= 0x3134A); /* CJK Ideograph Extension G */ } /* macros used to determine if the given code point is in the PUA range that @@ -1057,7 +1057,7 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen, int offset; int i; int word; - const unsigned char* w; + const unsigned char* w; if (code >= 0x110000) return 0; @@ -1158,7 +1158,7 @@ _cmpname(PyObject *self, int code, const char* name, int namelen) if (!_getucname(self, code, buffer, NAME_MAXLEN, 1)) return 0; for (i = 0; i < namelen; i++) { - if (Py_TOUPPER(name[i]) != buffer[i]) + if (Py_TOUPPER(name[i]) != buffer[i]) return 0; } return buffer[namelen] == '\0'; @@ -1379,7 +1379,7 @@ static PyMethodDef unicodedata_functions[] = { UNICODEDATA_UCD_DECOMPOSITION_METHODDEF UNICODEDATA_UCD_NAME_METHODDEF UNICODEDATA_UCD_LOOKUP_METHODDEF - UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF + UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF UNICODEDATA_UCD_NORMALIZE_METHODDEF {NULL, NULL} /* sentinel */ }; @@ -1393,10 +1393,10 @@ static PyTypeObject UCD_Type = { 0, /*tp_itemsize*/ /* methods */ (destructor)PyObject_Del, /*tp_dealloc*/ - 0, /*tp_vectorcall_offset*/ + 0, /*tp_vectorcall_offset*/ 0, /*tp_getattr*/ 0, /*tp_setattr*/ - 0, /*tp_as_async*/ + 0, /*tp_as_async*/ 0, /*tp_repr*/ 0, /*tp_as_number*/ 0, /*tp_as_sequence*/ @@ -1456,7 +1456,7 @@ PyInit_unicodedata(void) { PyObject *m, *v; - Py_SET_TYPE(&UCD_Type, &PyType_Type); + Py_SET_TYPE(&UCD_Type, &PyType_Type); m = PyModule_Create(&unicodedatamodule); if (!m) |