From 31f2a419764a8ba77c2a970cfc80056c6cd06756 Mon Sep 17 00:00:00 2001 From: shadchin Date: Mon, 12 Feb 2024 07:53:52 +0300 Subject: Update Python from 3.11.8 to 3.12.2 --- contrib/tools/python3/src/Objects/unicodeobject.c | 2204 +++++++-------------- 1 file changed, 706 insertions(+), 1498 deletions(-) (limited to 'contrib/tools/python3/src/Objects/unicodeobject.c') diff --git a/contrib/tools/python3/src/Objects/unicodeobject.c b/contrib/tools/python3/src/Objects/unicodeobject.c index b5510fc1c85..f6787c8f8aa 100644 --- a/contrib/tools/python3/src/Objects/unicodeobject.c +++ b/contrib/tools/python3/src/Objects/unicodeobject.c @@ -54,7 +54,9 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #include "pycore_pystate.h" // _PyInterpreterState_GET() #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI #include "pycore_unicodeobject.h" // struct _Py_unicode_state +#include "pycore_unicodeobject_generated.h" // _PyUnicode_InitStaticStrings() #include "stringlib/eq.h" // unicode_eq() +#include // ptrdiff_t #ifdef MS_WINDOWS #include @@ -115,7 +117,6 @@ extern "C" { (_PyCompactUnicodeObject_CAST(op)->utf8) #define PyUnicode_UTF8(op) \ (assert(_PyUnicode_CHECK(op)), \ - assert(PyUnicode_IS_READY(op)), \ PyUnicode_IS_COMPACT_ASCII(op) ? \ ((char*)(_PyASCIIObject_CAST(op) + 1)) : \ _PyUnicode_UTF8(op)) @@ -123,21 +124,10 @@ extern "C" { (_PyCompactUnicodeObject_CAST(op)->utf8_length) #define PyUnicode_UTF8_LENGTH(op) \ (assert(_PyUnicode_CHECK(op)), \ - assert(PyUnicode_IS_READY(op)), \ PyUnicode_IS_COMPACT_ASCII(op) ? \ _PyASCIIObject_CAST(op)->length : \ _PyUnicode_UTF8_LENGTH(op)) -#define _PyUnicode_WSTR(op) \ - (_PyASCIIObject_CAST(op)->wstr) - -/* Don't use deprecated macro of unicodeobject.h */ -#undef PyUnicode_WSTR_LENGTH -#define PyUnicode_WSTR_LENGTH(op) \ - (PyUnicode_IS_COMPACT_ASCII(op) ? \ - _PyASCIIObject_CAST(op)->length : \ - _PyCompactUnicodeObject_CAST(op)->wstr_length) -#define _PyUnicode_WSTR_LENGTH(op) \ - (_PyCompactUnicodeObject_CAST(op)->wstr_length) + #define _PyUnicode_LENGTH(op) \ (_PyASCIIObject_CAST(op)->length) #define _PyUnicode_STATE(op) \ @@ -153,20 +143,10 @@ extern "C" { #define _PyUnicode_DATA_ANY(op) \ (_PyUnicodeObject_CAST(op)->data.any) -#undef PyUnicode_READY -#define PyUnicode_READY(op) \ - (assert(_PyUnicode_CHECK(op)), \ - (PyUnicode_IS_READY(op) ? \ - 0 : \ - _PyUnicode_Ready(op))) - #define _PyUnicode_SHARE_UTF8(op) \ (assert(_PyUnicode_CHECK(op)), \ assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) -#define _PyUnicode_SHARE_WSTR(op) \ - (assert(_PyUnicode_CHECK(op)), \ - (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) /* true if the Unicode object has an allocated UTF-8 memory block (not shared with other data) */ @@ -175,13 +155,6 @@ extern "C" { && _PyUnicode_UTF8(op) \ && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) -/* true if the Unicode object has an allocated wstr memory block - (not shared with other data) */ -#define _PyUnicode_HAS_WSTR_MEMORY(op) \ - ((_PyUnicode_WSTR(op) && \ - (!PyUnicode_IS_READY(op) || \ - _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) - /* Generic helper macro to convert characters of different types. from_type and to_type have to be valid type names, begin and end are pointers to the source characters which should be of type @@ -219,16 +192,6 @@ extern "C" { # define OVERALLOCATE_FACTOR 4 #endif -/* This dictionary holds all interned unicode strings. Note that references - to strings in this dictionary are *not* counted in the string's ob_refcnt. - When the interned string reaches a refcnt of 0 the string deallocation - function will delete the reference from this dictionary. - - Another way to look at this is that to say that the actual reference - count of a string is: s->ob_refcnt + (s->state ? 2 : 0) -*/ -static PyObject *interned = NULL; - /* Forward declaration */ static inline int _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); @@ -259,8 +222,89 @@ static inline PyObject* unicode_get_empty(void) static inline PyObject* unicode_new_empty(void) { PyObject *empty = unicode_get_empty(); - Py_INCREF(empty); - return empty; + return Py_NewRef(empty); +} + +/* This dictionary holds all interned unicode strings. Note that references + to strings in this dictionary are *not* counted in the string's ob_refcnt. + When the interned string reaches a refcnt of 0 the string deallocation + function will delete the reference from this dictionary. +*/ +static inline PyObject *get_interned_dict(PyInterpreterState *interp) +{ + return _Py_INTERP_CACHED_OBJECT(interp, interned_strings); +} + +#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings + +Py_ssize_t +_PyUnicode_InternedSize(void) +{ + PyObject *dict = get_interned_dict(_PyInterpreterState_GET()); + return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict); +} + +static Py_hash_t unicode_hash(PyObject *); +static int unicode_compare_eq(PyObject *, PyObject *); + +static Py_uhash_t +hashtable_unicode_hash(const void *key) +{ + return unicode_hash((PyObject *)key); +} + +static int +hashtable_unicode_compare(const void *key1, const void *key2) +{ + PyObject *obj1 = (PyObject *)key1; + PyObject *obj2 = (PyObject *)key2; + if (obj1 != NULL && obj2 != NULL) { + return unicode_compare_eq(obj1, obj2); + } + else { + return obj1 == obj2; + } +} + +static int +init_interned_dict(PyInterpreterState *interp) +{ + if (_Py_IsMainInterpreter(interp)) { + assert(INTERNED_STRINGS == NULL); + _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree}; + INTERNED_STRINGS = _Py_hashtable_new_full( + hashtable_unicode_hash, + hashtable_unicode_compare, + NULL, + NULL, + &hashtable_alloc + ); + if (INTERNED_STRINGS == NULL) { + return -1; + } + } + assert(get_interned_dict(interp) == NULL); + PyObject *interned = interned = PyDict_New(); + if (interned == NULL) { + return -1; + } + _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned; + return 0; +} + +static void +clear_interned_dict(PyInterpreterState *interp) +{ + PyObject *interned = get_interned_dict(interp); + if (interned != NULL) { + PyDict_Clear(interned); + Py_DECREF(interned); + _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL; + } + if (_Py_IsMainInterpreter(interp) && INTERNED_STRINGS != NULL) { + _Py_hashtable_destroy(INTERNED_STRINGS); + INTERNED_STRINGS = NULL; + } } #define _Py_RETURN_UNICODE_EMPTY() \ @@ -269,11 +313,10 @@ static inline PyObject* unicode_new_empty(void) } while (0) static inline void -unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value, +unicode_fill(int kind, void *data, Py_UCS4 value, Py_ssize_t start, Py_ssize_t length) { assert(0 <= start); - assert(kind != PyUnicode_WCHAR_KIND); switch (kind) { case PyUnicode_1BYTE_KIND: { assert(value <= 0xff); @@ -335,7 +378,6 @@ const unsigned char _Py_ascii_whitespace[] = { }; /* forward */ -static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); static PyObject* get_latin1_char(unsigned char ch); static int unicode_modifiable(PyObject *unicode); @@ -478,7 +520,14 @@ unicode_check_encoding_errors(const char *encoding, const char *errors) return 0; } - if (encoding != NULL) { + if (encoding != NULL + // Fast path for the most common built-in encodings. Even if the codec + // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to + // create a temporary Unicode string (the key in the cache). + && strcmp(encoding, "utf-8") != 0 + && strcmp(encoding, "utf8") != 0 + && strcmp(encoding, "ascii") != 0) + { PyObject *handler = _PyCodec_Lookup(encoding); if (handler == NULL) { return -1; @@ -486,7 +535,14 @@ unicode_check_encoding_errors(const char *encoding, const char *errors) Py_DECREF(handler); } - if (errors != NULL) { + if (errors != NULL + // Fast path for the most common built-in error handlers. + && strcmp(errors, "strict") != 0 + && strcmp(errors, "ignore") != 0 + && strcmp(errors, "replace") != 0 + && strcmp(errors, "surrogateescape") != 0 + && strcmp(errors, "surrogatepass") != 0) + { PyObject *handler = PyCodec_LookupError(errors); if (handler == NULL) { return -1; @@ -507,11 +563,10 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) CHECK(PyUnicode_Check(op)); PyASCIIObject *ascii = _PyASCIIObject_CAST(op); - unsigned int kind = ascii->state.kind; + int kind = ascii->state.kind; if (ascii->state.ascii == 1 && ascii->state.compact == 1) { CHECK(kind == PyUnicode_1BYTE_KIND); - CHECK(ascii->state.ready == 1); } else { PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op); @@ -523,62 +578,32 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) || kind == PyUnicode_2BYTE_KIND || kind == PyUnicode_4BYTE_KIND); CHECK(ascii->state.ascii == 0); - CHECK(ascii->state.ready == 1); CHECK(compact->utf8 != data); } else { PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op); data = unicode->data.any; - if (kind == PyUnicode_WCHAR_KIND) { - CHECK(ascii->length == 0); - CHECK(ascii->hash == -1); - CHECK(ascii->state.compact == 0); - CHECK(ascii->state.ascii == 0); - CHECK(ascii->state.ready == 0); - CHECK(ascii->state.interned == SSTATE_NOT_INTERNED); - CHECK(ascii->wstr != NULL); - CHECK(data == NULL); - CHECK(compact->utf8 == NULL); + CHECK(kind == PyUnicode_1BYTE_KIND + || kind == PyUnicode_2BYTE_KIND + || kind == PyUnicode_4BYTE_KIND); + CHECK(ascii->state.compact == 0); + CHECK(data != NULL); + if (ascii->state.ascii) { + CHECK(compact->utf8 == data); + CHECK(compact->utf8_length == ascii->length); } else { - CHECK(kind == PyUnicode_1BYTE_KIND - || kind == PyUnicode_2BYTE_KIND - || kind == PyUnicode_4BYTE_KIND); - CHECK(ascii->state.compact == 0); - CHECK(ascii->state.ready == 1); - CHECK(data != NULL); - if (ascii->state.ascii) { - CHECK(compact->utf8 == data); - CHECK(compact->utf8_length == ascii->length); - } - else - CHECK(compact->utf8 != data); + CHECK(compact->utf8 != data); } } - if (kind != PyUnicode_WCHAR_KIND) { - if ( -#if SIZEOF_WCHAR_T == 2 - kind == PyUnicode_2BYTE_KIND -#else - kind == PyUnicode_4BYTE_KIND -#endif - ) - { - CHECK(ascii->wstr == data); - CHECK(compact->wstr_length == ascii->length); - } else - CHECK(ascii->wstr != data); - } if (compact->utf8 == NULL) CHECK(compact->utf8_length == 0); - if (ascii->wstr == NULL) - CHECK(compact->wstr_length == 0); } /* check that the best kind is used: O(n) operation */ - if (check_content && kind != PyUnicode_WCHAR_KIND) { + if (check_content) { Py_ssize_t i; Py_UCS4 maxchar = 0; const void *data; @@ -614,47 +639,12 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) #undef CHECK } - static PyObject* -unicode_result_wchar(PyObject *unicode) -{ -#ifndef Py_DEBUG - Py_ssize_t len; - - len = _PyUnicode_WSTR_LENGTH(unicode); - if (len == 0) { - Py_DECREF(unicode); - _Py_RETURN_UNICODE_EMPTY(); - } - - if (len == 1) { - wchar_t ch = _PyUnicode_WSTR(unicode)[0]; - if ((Py_UCS4)ch < 256) { - Py_DECREF(unicode); - return get_latin1_char((unsigned char)ch); - } - } - - if (_PyUnicode_Ready(unicode) < 0) { - Py_DECREF(unicode); - return NULL; - } -#else - assert(Py_REFCNT(unicode) == 1); - - /* don't make the result ready in debug mode to ensure that the caller - makes the string ready before using it */ - assert(_PyUnicode_CheckConsistency(unicode, 1)); -#endif - return unicode; -} - -static PyObject* -unicode_result_ready(PyObject *unicode) +unicode_result(PyObject *unicode) { - Py_ssize_t length; + assert(_PyUnicode_CHECK(unicode)); - length = PyUnicode_GET_LENGTH(unicode); + Py_ssize_t length = PyUnicode_GET_LENGTH(unicode); if (length == 0) { PyObject *empty = unicode_get_empty(); if (unicode != empty) { @@ -682,24 +672,11 @@ unicode_result_ready(PyObject *unicode) return unicode; } -static PyObject* -unicode_result(PyObject *unicode) -{ - assert(_PyUnicode_CHECK(unicode)); - if (PyUnicode_IS_READY(unicode)) - return unicode_result_ready(unicode); - else - return unicode_result_wchar(unicode); -} - static PyObject* unicode_result_unchanged(PyObject *unicode) { if (PyUnicode_CheckExact(unicode)) { - if (PyUnicode_READY(unicode) == -1) - return NULL; - Py_INCREF(unicode); - return unicode; + return Py_NewRef(unicode); } else /* Subtype -- return genuine unicode string with the same value. */ @@ -714,10 +691,9 @@ backslashreplace(_PyBytesWriter *writer, char *str, { Py_ssize_t size, i; Py_UCS4 ch; - enum PyUnicode_Kind kind; + int kind; const void *data; - assert(PyUnicode_IS_READY(unicode)); kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); @@ -781,10 +757,9 @@ xmlcharrefreplace(_PyBytesWriter *writer, char *str, { Py_ssize_t size, i; Py_UCS4 ch; - enum PyUnicode_Kind kind; + int kind; const void *data; - assert(PyUnicode_IS_READY(unicode)); kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); @@ -908,7 +883,7 @@ ensure_unicode(PyObject *obj) Py_TYPE(obj)->tp_name); return -1; } - return PyUnicode_READY(obj); + return 0; } /* Compilation of templated routines */ @@ -954,15 +929,6 @@ ensure_unicode(PyObject *obj) #include "stringlib/find_max_char.h" #include "stringlib/undef.h" -_Py_COMP_DIAG_PUSH -_Py_COMP_DIAG_IGNORE_DEPR_DECLS -#include "stringlib/unicodedefs.h" -#include "stringlib/fastsearch.h" -#include "stringlib/count.h" -#include "stringlib/find.h" -#include "stringlib/undef.h" -_Py_COMP_DIAG_POP - #undef STRINGLIB_GET_EMPTY /* --- Unicode Object ----------------------------------------------------- */ @@ -1022,14 +988,12 @@ resize_compact(PyObject *unicode, Py_ssize_t length) Py_ssize_t char_size; Py_ssize_t struct_size; Py_ssize_t new_size; - int share_wstr; PyObject *new_unicode; #ifdef Py_DEBUG Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); #endif assert(unicode_modifiable(unicode)); - assert(PyUnicode_IS_READY(unicode)); assert(PyUnicode_IS_COMPACT(unicode)); char_size = PyUnicode_KIND(unicode); @@ -1037,7 +1001,6 @@ resize_compact(PyObject *unicode, Py_ssize_t length) struct_size = sizeof(PyASCIIObject); else struct_size = sizeof(PyCompactUnicodeObject); - share_wstr = _PyUnicode_SHARE_WSTR(unicode); if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { PyErr_NoMemory(); @@ -1050,34 +1013,20 @@ resize_compact(PyObject *unicode, Py_ssize_t length) _PyUnicode_UTF8(unicode) = NULL; _PyUnicode_UTF8_LENGTH(unicode) = 0; } -#ifdef Py_REF_DEBUG - _Py_RefTotal--; -#endif #ifdef Py_TRACE_REFS _Py_ForgetReference(unicode); #endif new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size); if (new_unicode == NULL) { - _Py_NewReference(unicode); + _Py_NewReferenceNoTotal(unicode); PyErr_NoMemory(); return NULL; } unicode = new_unicode; - _Py_NewReference(unicode); + _Py_NewReferenceNoTotal(unicode); _PyUnicode_LENGTH(unicode) = length; - if (share_wstr) { - _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); - if (!PyUnicode_IS_ASCII(unicode)) - _PyUnicode_WSTR_LENGTH(unicode) = length; - } - else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { - PyObject_Free(_PyUnicode_WSTR(unicode)); - _PyUnicode_WSTR(unicode) = NULL; - if (!PyUnicode_IS_ASCII(unicode)) - _PyUnicode_WSTR_LENGTH(unicode) = 0; - } #ifdef Py_DEBUG unicode_fill_invalid(unicode, old_length); #endif @@ -1090,78 +1039,55 @@ resize_compact(PyObject *unicode, Py_ssize_t length) static int resize_inplace(PyObject *unicode, Py_ssize_t length) { - wchar_t *wstr; - Py_ssize_t new_size; assert(!PyUnicode_IS_COMPACT(unicode)); assert(Py_REFCNT(unicode) == 1); - if (PyUnicode_IS_READY(unicode)) { - Py_ssize_t char_size; - int share_wstr, share_utf8; - void *data; + Py_ssize_t new_size; + Py_ssize_t char_size; + int share_utf8; + void *data; #ifdef Py_DEBUG - Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); + Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); #endif - data = _PyUnicode_DATA_ANY(unicode); - char_size = PyUnicode_KIND(unicode); - share_wstr = _PyUnicode_SHARE_WSTR(unicode); - share_utf8 = _PyUnicode_SHARE_UTF8(unicode); + data = _PyUnicode_DATA_ANY(unicode); + char_size = PyUnicode_KIND(unicode); + share_utf8 = _PyUnicode_SHARE_UTF8(unicode); - if (length > (PY_SSIZE_T_MAX / char_size - 1)) { - PyErr_NoMemory(); - return -1; - } - new_size = (length + 1) * char_size; + if (length > (PY_SSIZE_T_MAX / char_size - 1)) { + PyErr_NoMemory(); + return -1; + } + new_size = (length + 1) * char_size; - if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) - { - PyObject_Free(_PyUnicode_UTF8(unicode)); - _PyUnicode_UTF8(unicode) = NULL; - _PyUnicode_UTF8_LENGTH(unicode) = 0; - } + if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) + { + PyObject_Free(_PyUnicode_UTF8(unicode)); + _PyUnicode_UTF8(unicode) = NULL; + _PyUnicode_UTF8_LENGTH(unicode) = 0; + } - data = (PyObject *)PyObject_Realloc(data, new_size); - if (data == NULL) { - PyErr_NoMemory(); - return -1; - } - _PyUnicode_DATA_ANY(unicode) = data; - if (share_wstr) { - _PyUnicode_WSTR(unicode) = data; - _PyUnicode_WSTR_LENGTH(unicode) = length; - } - if (share_utf8) { - _PyUnicode_UTF8(unicode) = data; - _PyUnicode_UTF8_LENGTH(unicode) = length; - } - _PyUnicode_LENGTH(unicode) = length; - PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); + data = (PyObject *)PyObject_Realloc(data, new_size); + if (data == NULL) { + PyErr_NoMemory(); + return -1; + } + _PyUnicode_DATA_ANY(unicode) = data; + if (share_utf8) { + _PyUnicode_UTF8(unicode) = data; + _PyUnicode_UTF8_LENGTH(unicode) = length; + } + _PyUnicode_LENGTH(unicode) = length; + PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); #ifdef Py_DEBUG - unicode_fill_invalid(unicode, old_length); + unicode_fill_invalid(unicode, old_length); #endif - if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { - assert(_PyUnicode_CheckConsistency(unicode, 0)); - return 0; - } - } - assert(_PyUnicode_WSTR(unicode) != NULL); /* check for integer overflow */ if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) { PyErr_NoMemory(); return -1; } - new_size = sizeof(wchar_t) * (length + 1); - wstr = _PyUnicode_WSTR(unicode); - wstr = PyObject_Realloc(wstr, new_size); - if (!wstr) { - PyErr_NoMemory(); - return -1; - } - _PyUnicode_WSTR(unicode) = wstr; - _PyUnicode_WSTR(unicode)[length] = 0; - _PyUnicode_WSTR_LENGTH(unicode) = length; assert(_PyUnicode_CheckConsistency(unicode, 0)); return 0; } @@ -1170,99 +1096,15 @@ static PyObject* resize_copy(PyObject *unicode, Py_ssize_t length) { Py_ssize_t copy_length; - if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { - PyObject *copy; - - assert(PyUnicode_IS_READY(unicode)); - - copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); - if (copy == NULL) - return NULL; - - copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); - _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); - return copy; - } - else { - PyObject *w; - - w = (PyObject*)_PyUnicode_New(length); - if (w == NULL) - return NULL; - copy_length = _PyUnicode_WSTR_LENGTH(unicode); - copy_length = Py_MIN(copy_length, length); - memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), - copy_length * sizeof(wchar_t)); - return w; - } -} - -/* We allocate one more byte to make sure the string is - Ux0000 terminated; some code (e.g. new_identifier) - relies on that. - - XXX This allocator could further be enhanced by assuring that the - free list never reduces its size below 1. - -*/ - -static PyUnicodeObject * -_PyUnicode_New(Py_ssize_t length) -{ - PyUnicodeObject *unicode; - size_t new_size; - - /* Optimization for empty strings */ - if (length == 0) { - return (PyUnicodeObject *)unicode_new_empty(); - } - - /* Ensure we won't overflow the size. */ - if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { - return (PyUnicodeObject *)PyErr_NoMemory(); - } - if (length < 0) { - PyErr_SetString(PyExc_SystemError, - "Negative size passed to _PyUnicode_New"); - return NULL; - } - - unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); - if (unicode == NULL) - return NULL; - new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); + PyObject *copy; - _PyUnicode_WSTR_LENGTH(unicode) = length; - _PyUnicode_HASH(unicode) = -1; - _PyUnicode_STATE(unicode).interned = 0; - _PyUnicode_STATE(unicode).kind = 0; - _PyUnicode_STATE(unicode).compact = 0; - _PyUnicode_STATE(unicode).ready = 0; - _PyUnicode_STATE(unicode).ascii = 0; - _PyUnicode_DATA_ANY(unicode) = NULL; - _PyUnicode_LENGTH(unicode) = 0; - _PyUnicode_UTF8(unicode) = NULL; - _PyUnicode_UTF8_LENGTH(unicode) = 0; - - _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size); - if (!_PyUnicode_WSTR(unicode)) { - Py_DECREF(unicode); - PyErr_NoMemory(); + copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); + if (copy == NULL) return NULL; - } - - /* Initialize the first element to guard against cases where - * the caller fails before initializing str -- unicode_resize() - * reads str[0], and the Keep-Alive optimization can keep memory - * allocated for str alive across a call to unicode_dealloc(unicode). - * We don't want unicode_resize to read uninitialized memory in - * that case. - */ - _PyUnicode_WSTR(unicode)[0] = 0; - _PyUnicode_WSTR(unicode)[length] = 0; - assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); - return unicode; + copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); + _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); + return copy; } static const char* @@ -1272,8 +1114,6 @@ unicode_kind_name(PyObject *unicode) _PyUnicode_Dump() */ if (!PyUnicode_IS_COMPACT(unicode)) { - if (!PyUnicode_IS_READY(unicode)) - return "wstr"; switch (PyUnicode_KIND(unicode)) { case PyUnicode_1BYTE_KIND: @@ -1289,7 +1129,6 @@ unicode_kind_name(PyObject *unicode) return ""; } } - assert(PyUnicode_IS_READY(unicode)); switch (PyUnicode_KIND(unicode)) { case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(unicode)) @@ -1346,15 +1185,7 @@ _PyUnicode_Dump(PyObject *op) data = unicode->data.any; printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length); - if (ascii->wstr == data) - printf("shared "); - printf("wstr=%p", (void *)ascii->wstr); - - if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { - printf(" (%zu), ", compact->wstr_length); - if (!ascii->state.compact && compact->utf8 == unicode->data.any) { - printf("shared "); - } + if (!ascii->state.ascii) { printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length); } printf(", data=%p\n", data); @@ -1373,13 +1204,12 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) PyObject *obj; PyCompactUnicodeObject *unicode; void *data; - enum PyUnicode_Kind kind; - int is_sharing, is_ascii; + int kind; + int is_ascii; Py_ssize_t char_size; Py_ssize_t struct_size; is_ascii = 0; - is_sharing = 0; struct_size = sizeof(PyCompactUnicodeObject); if (maxchar < 128) { kind = PyUnicode_1BYTE_KIND; @@ -1394,8 +1224,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) else if (maxchar < 65536) { kind = PyUnicode_2BYTE_KIND; char_size = 2; - if (sizeof(wchar_t) == 2) - is_sharing = 1; } else { if (maxchar > MAX_UNICODE) { @@ -1405,8 +1233,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) } kind = PyUnicode_4BYTE_KIND; char_size = 4; - if (sizeof(wchar_t) == 4) - is_sharing = 1; } /* Ensure we won't overflow the size. */ @@ -1438,16 +1264,13 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) _PyUnicode_STATE(unicode).interned = 0; _PyUnicode_STATE(unicode).kind = kind; _PyUnicode_STATE(unicode).compact = 1; - _PyUnicode_STATE(unicode).ready = 1; _PyUnicode_STATE(unicode).ascii = is_ascii; + _PyUnicode_STATE(unicode).statically_allocated = 0; if (is_ascii) { ((char*)data)[size] = 0; - _PyUnicode_WSTR(unicode) = NULL; } else if (kind == PyUnicode_1BYTE_KIND) { ((char*)data)[size] = 0; - _PyUnicode_WSTR(unicode) = NULL; - _PyUnicode_WSTR_LENGTH(unicode) = 0; unicode->utf8 = NULL; unicode->utf8_length = 0; } @@ -1458,14 +1281,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) ((Py_UCS2*)data)[size] = 0; else /* kind == PyUnicode_4BYTE_KIND */ ((Py_UCS4*)data)[size] = 0; - if (is_sharing) { - _PyUnicode_WSTR_LENGTH(unicode) = size; - _PyUnicode_WSTR(unicode) = (wchar_t *)data; - } - else { - _PyUnicode_WSTR_LENGTH(unicode) = 0; - _PyUnicode_WSTR(unicode) = NULL; - } } #ifdef Py_DEBUG unicode_fill_invalid((PyObject*)unicode, 0); @@ -1530,7 +1345,7 @@ _copy_characters(PyObject *to, Py_ssize_t to_start, PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many, int check_maxchar) { - unsigned int from_kind, to_kind; + int from_kind, to_kind; const void *from_data; void *to_data; @@ -1538,11 +1353,9 @@ _copy_characters(PyObject *to, Py_ssize_t to_start, assert(0 <= from_start); assert(0 <= to_start); assert(PyUnicode_Check(from)); - assert(PyUnicode_IS_READY(from)); assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); assert(PyUnicode_Check(to)); - assert(PyUnicode_IS_READY(to)); assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); if (how_many == 0) @@ -1687,11 +1500,6 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, return -1; } - if (PyUnicode_READY(from) == -1) - return -1; - if (PyUnicode_READY(to) == -1) - return -1; - if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) { PyErr_SetString(PyExc_IndexError, "string index out of range"); return -1; @@ -1776,135 +1584,6 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, return 0; } -int -_PyUnicode_Ready(PyObject *unicode) -{ - wchar_t *end; - Py_UCS4 maxchar = 0; - Py_ssize_t num_surrogates; -#if SIZEOF_WCHAR_T == 2 - Py_ssize_t length_wo_surrogates; -#endif - - /* _PyUnicode_Ready() is only intended for old-style API usage where - strings were created using _PyObject_New() and where no canonical - representation (the str field) has been set yet aka strings - which are not yet ready. */ - assert(_PyUnicode_CHECK(unicode)); - assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); - assert(_PyUnicode_WSTR(unicode) != NULL); - assert(_PyUnicode_DATA_ANY(unicode) == NULL); - assert(_PyUnicode_UTF8(unicode) == NULL); - /* Actually, it should neither be interned nor be anything else: */ - assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); - - end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); - if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, - &maxchar, &num_surrogates) == -1) - return -1; - - if (maxchar < 256) { - _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1); - if (!_PyUnicode_DATA_ANY(unicode)) { - PyErr_NoMemory(); - return -1; - } - _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, - _PyUnicode_WSTR(unicode), end, - PyUnicode_1BYTE_DATA(unicode)); - PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; - _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); - _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; - if (maxchar < 128) { - _PyUnicode_STATE(unicode).ascii = 1; - _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); - _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); - } - else { - _PyUnicode_STATE(unicode).ascii = 0; - _PyUnicode_UTF8(unicode) = NULL; - _PyUnicode_UTF8_LENGTH(unicode) = 0; - } - PyObject_Free(_PyUnicode_WSTR(unicode)); - _PyUnicode_WSTR(unicode) = NULL; - _PyUnicode_WSTR_LENGTH(unicode) = 0; - } - /* In this case we might have to convert down from 4-byte native - wchar_t to 2-byte unicode. */ - else if (maxchar < 65536) { - assert(num_surrogates == 0 && - "FindMaxCharAndNumSurrogatePairs() messed up"); - -#if SIZEOF_WCHAR_T == 2 - /* We can share representations and are done. */ - _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); - PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; - _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); - _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; - _PyUnicode_UTF8(unicode) = NULL; - _PyUnicode_UTF8_LENGTH(unicode) = 0; -#else - /* sizeof(wchar_t) == 4 */ - _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc( - 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); - if (!_PyUnicode_DATA_ANY(unicode)) { - PyErr_NoMemory(); - return -1; - } - _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, - _PyUnicode_WSTR(unicode), end, - PyUnicode_2BYTE_DATA(unicode)); - PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; - _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); - _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; - _PyUnicode_UTF8(unicode) = NULL; - _PyUnicode_UTF8_LENGTH(unicode) = 0; - PyObject_Free(_PyUnicode_WSTR(unicode)); - _PyUnicode_WSTR(unicode) = NULL; - _PyUnicode_WSTR_LENGTH(unicode) = 0; -#endif - } - /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */ - else { -#if SIZEOF_WCHAR_T == 2 - /* in case the native representation is 2-bytes, we need to allocate a - new normalized 4-byte version. */ - length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; - if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) { - PyErr_NoMemory(); - return -1; - } - _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1)); - if (!_PyUnicode_DATA_ANY(unicode)) { - PyErr_NoMemory(); - return -1; - } - _PyUnicode_LENGTH(unicode) = length_wo_surrogates; - _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; - _PyUnicode_UTF8(unicode) = NULL; - _PyUnicode_UTF8_LENGTH(unicode) = 0; - /* unicode_convert_wchar_to_ucs4() requires a ready string */ - _PyUnicode_STATE(unicode).ready = 1; - unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); - PyObject_Free(_PyUnicode_WSTR(unicode)); - _PyUnicode_WSTR(unicode) = NULL; - _PyUnicode_WSTR_LENGTH(unicode) = 0; -#else - assert(num_surrogates == 0); - - _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); - _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); - _PyUnicode_UTF8(unicode) = NULL; - _PyUnicode_UTF8_LENGTH(unicode) = 0; - _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; -#endif - PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; - } - _PyUnicode_STATE(unicode).ready = 1; - assert(_PyUnicode_CheckConsistency(unicode, 1)); - return 0; -} - static void unicode_dealloc(PyObject *unicode) { @@ -1913,38 +1592,15 @@ unicode_dealloc(PyObject *unicode) _Py_FatalRefcountError("deallocating an Unicode singleton"); } #endif - - switch (PyUnicode_CHECK_INTERNED(unicode)) { - case SSTATE_NOT_INTERNED: - break; - case SSTATE_INTERNED_MORTAL: + /* This should never get called, but we also don't want to SEGV if + * we accidentally decref an immortal string out of existence. Since + * the string is an immortal object, just re-set the reference count. + */ + if (PyUnicode_CHECK_INTERNED(unicode) + || _PyUnicode_STATE(unicode).statically_allocated) { - /* Revive the dead object temporarily. PyDict_DelItem() removes two - references (key and value) which were ignored by - PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2 - to prevent calling unicode_dealloc() again. Adjust refcnt after - PyDict_DelItem(). */ - assert(Py_REFCNT(unicode) == 0); - Py_SET_REFCNT(unicode, 3); - if (PyDict_DelItem(interned, unicode) != 0) { - _PyErr_WriteUnraisableMsg("deletion of interned string failed", - NULL); - } - assert(Py_REFCNT(unicode) == 1); - Py_SET_REFCNT(unicode, 0); - break; - } - - case SSTATE_INTERNED_IMMORTAL: - _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died"); - break; - - default: - Py_UNREACHABLE(); - } - - if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { - PyObject_Free(_PyUnicode_WSTR(unicode)); + _Py_SetImmortal(unicode); + return; } if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) { PyObject_Free(_PyUnicode_UTF8(unicode)); @@ -1965,7 +1621,7 @@ unicode_is_singleton(PyObject *unicode) } PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode); - if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) { + if (ascii->length == 1) { Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); if (ch < 256 && LATIN1(ch) == unicode) { return 1; @@ -2007,10 +1663,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length) assert(PyUnicode_Check(unicode)); assert(0 <= length); - if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) - old_length = PyUnicode_WSTR_LENGTH(unicode); - else - old_length = PyUnicode_GET_LENGTH(unicode); + old_length = PyUnicode_GET_LENGTH(unicode); if (old_length == length) return 0; @@ -2064,7 +1717,7 @@ static void unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str, Py_ssize_t len) { - enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); + int kind = PyUnicode_KIND(unicode); const void *data = PyUnicode_DATA(unicode); const char *end = str + len; @@ -2139,28 +1792,6 @@ unicode_char(Py_UCS4 ch) return unicode; } -PyObject * -PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) -{ - if (u == NULL) { - if (size > 0) { - if (PyErr_WarnEx(PyExc_DeprecationWarning, - "PyUnicode_FromUnicode(NULL, size) is deprecated; " - "use PyUnicode_New() instead", 1) < 0) { - return NULL; - } - } - return (PyObject*)_PyUnicode_New(size); - } - - if (size < 0) { - PyErr_BadInternalCall(); - return NULL; - } - - return PyUnicode_FromWideChar(u, size); -} - PyObject * PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) { @@ -2254,16 +1885,12 @@ PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) if (u != NULL) { return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); } - else { - if (size > 0) { - if (PyErr_WarnEx(PyExc_DeprecationWarning, - "PyUnicode_FromStringAndSize(NULL, size) is deprecated; " - "use PyUnicode_New() instead", 1) < 0) { - return NULL; - } - } - return (PyObject *)_PyUnicode_New(size); + if (size > 0) { + PyErr_SetString(PyExc_SystemError, + "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize"); + return NULL; } + return unicode_new_empty(); } PyObject * @@ -2291,7 +1918,7 @@ _PyUnicode_FromId(_Py_Identifier *id) Py_ssize_t index = _Py_atomic_size_get(&id->index); if (index < 0) { - struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids; + struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids; PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK); // Check again to detect concurrent access. Another thread can have @@ -2382,7 +2009,7 @@ _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) } static Py_UCS4 -kind_maxchar_limit(unsigned int kind) +kind_maxchar_limit(int kind) { switch (kind) { case PyUnicode_1BYTE_KIND: @@ -2496,10 +2123,9 @@ PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) Py_UCS4 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) { - enum PyUnicode_Kind kind; + int kind; const void *startptr, *endptr; - assert(PyUnicode_IS_READY(unicode)); assert(0 <= start); assert(end <= PyUnicode_GET_LENGTH(unicode)); assert(start <= end); @@ -2538,11 +2164,10 @@ unicode_adjust_maxchar(PyObject **p_unicode) PyObject *unicode, *copy; Py_UCS4 max_char; Py_ssize_t len; - unsigned int kind; + int kind; assert(p_unicode != NULL); unicode = *p_unicode; - assert(PyUnicode_IS_READY(unicode)); if (PyUnicode_IS_ASCII(unicode)) return; @@ -2586,8 +2211,6 @@ _PyUnicode_Copy(PyObject *unicode) PyErr_BadInternalCall(); return NULL; } - if (PyUnicode_READY(unicode) == -1) - return NULL; length = PyUnicode_GET_LENGTH(unicode); copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); @@ -2606,7 +2229,7 @@ _PyUnicode_Copy(PyObject *unicode) character. Return NULL on error. */ static void* -unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind) +unicode_askind(int skind, void const *data, Py_ssize_t len, int kind) { void *result; @@ -2656,8 +2279,6 @@ as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, int kind; const void *data; Py_ssize_t len, targetlen; - if (PyUnicode_READY(string) == -1) - return NULL; kind = PyUnicode_KIND(string); data = PyUnicode_DATA(string); len = PyUnicode_GET_LENGTH(string); @@ -2716,21 +2337,19 @@ PyUnicode_AsUCS4Copy(PyObject *string) return as_ucs4(string, NULL, 0, 1); } -/* maximum number of characters required for output of %lld or %p. - We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, - plus 1 for the sign. 53/22 is an upper bound for log10(256). */ -#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) +/* maximum number of characters required for output of %jo or %jd or %p. + We need at most ceil(log8(256)*sizeof(intmax_t)) digits, + plus 1 for the sign, plus 2 for the 0x prefix (for %p), + plus 1 for the terminal NUL. */ +#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3) static int unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, - Py_ssize_t width, Py_ssize_t precision) + Py_ssize_t width, Py_ssize_t precision, int flags) { Py_ssize_t length, fill, arglen; Py_UCS4 maxchar; - if (PyUnicode_READY(str) == -1) - return -1; - length = PyUnicode_GET_LENGTH(str); if ((precision == -1 || precision >= length) && width <= length) @@ -2748,8 +2367,8 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1) return -1; - if (width > length) { - fill = width - length; + fill = Py_MAX(width - length, 0); + if (fill && !(flags & F_LJUST)) { if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) return -1; writer->pos += fill; @@ -2758,12 +2377,19 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, str, 0, length); writer->pos += length; + + if (fill && (flags & F_LJUST)) { + if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) + return -1; + writer->pos += fill; + } + return 0; } static int unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, - Py_ssize_t width, Py_ssize_t precision) + Py_ssize_t width, Py_ssize_t precision, int flags) { /* UTF-8 */ Py_ssize_t length; @@ -2783,36 +2409,93 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, if (unicode == NULL) return -1; - res = unicode_fromformat_write_str(writer, unicode, width, -1); + res = unicode_fromformat_write_str(writer, unicode, width, -1, flags); + Py_DECREF(unicode); + return res; +} + +static int +unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str, + Py_ssize_t width, Py_ssize_t precision, int flags) +{ + /* UTF-8 */ + Py_ssize_t length; + PyObject *unicode; + int res; + + if (precision == -1) { + length = wcslen(str); + } + else { + length = 0; + while (length < precision && str[length]) { + length++; + } + } + unicode = PyUnicode_FromWideChar(str, length); + if (unicode == NULL) + return -1; + + res = unicode_fromformat_write_str(writer, unicode, width, -1, flags); Py_DECREF(unicode); return res; } +#define F_LONG 1 +#define F_LONGLONG 2 +#define F_SIZE 3 +#define F_PTRDIFF 4 +#define F_INTMAX 5 +static const char * const formats[] = {"%d", "%ld", "%lld", "%zd", "%td", "%jd"}; +static const char * const formats_o[] = {"%o", "%lo", "%llo", "%zo", "%to", "%jo"}; +static const char * const formats_u[] = {"%u", "%lu", "%llu", "%zu", "%tu", "%ju"}; +static const char * const formats_x[] = {"%x", "%lx", "%llx", "%zx", "%tx", "%jx"}; +static const char * const formats_X[] = {"%X", "%lX", "%llX", "%zX", "%tX", "%jX"}; + static const char* unicode_fromformat_arg(_PyUnicodeWriter *writer, const char *f, va_list *vargs) { const char *p; Py_ssize_t len; - int zeropad; + int flags = 0; Py_ssize_t width; Py_ssize_t precision; - int longflag; - int longlongflag; - int size_tflag; - Py_ssize_t fill; p = f; f++; - zeropad = 0; - if (*f == '0') { - zeropad = 1; + if (*f == '%') { + if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) + return NULL; f++; + return f; + } + + /* Parse flags. Example: "%-i" => flags=F_LJUST. */ + /* Flags '+', ' ' and '#' are not particularly useful. + * They are not worth the implementation and maintenance costs. + * In addition, '#' should add "0" for "o" conversions for compatibility + * with printf, but it would confuse Python users. */ + while (1) { + switch (*f++) { + case '-': flags |= F_LJUST; continue; + case '0': flags |= F_ZERO; continue; + } + f--; + break; } /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ width = -1; - if (Py_ISDIGIT((unsigned)*f)) { + if (*f == '*') { + width = va_arg(*vargs, int); + if (width < 0) { + flags |= F_LJUST; + width = -width; + } + f++; + } + else if (Py_ISDIGIT((unsigned)*f)) { width = *f - '0'; f++; while (Py_ISDIGIT((unsigned)*f)) { @@ -2828,7 +2511,14 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, precision = -1; if (*f == '.') { f++; - if (Py_ISDIGIT((unsigned)*f)) { + if (*f == '*') { + precision = va_arg(*vargs, int); + if (precision < 0) { + precision = -2; + } + f++; + } + else if (Py_ISDIGIT((unsigned)*f)) { precision = (*f - '0'); f++; while (Py_ISDIGIT((unsigned)*f)) { @@ -2841,40 +2531,49 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, f++; } } - if (*f == '%') { - /* "%.3%s" => f points to "3" */ - f--; - } - } - if (*f == '\0') { - /* bogus format "%.123" => go backward, f points to "3" */ - f--; } - /* Handle %ld, %lu, %lld and %llu. */ - longflag = 0; - longlongflag = 0; - size_tflag = 0; + int sizemod = 0; if (*f == 'l') { - if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { - longflag = 1; - ++f; - } - else if (f[1] == 'l' && - (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { - longlongflag = 1; + if (f[1] == 'l') { + sizemod = F_LONGLONG; f += 2; } + else { + sizemod = F_LONG; + ++f; + } } - /* handle the size_t flag. */ - else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { - size_tflag = 1; + else if (*f == 'z') { + sizemod = F_SIZE; ++f; } - - if (f[1] == '\0') + else if (*f == 't') { + sizemod = F_PTRDIFF; + ++f; + } + else if (*f == 'j') { + sizemod = F_INTMAX; + ++f; + } + if (f[0] != '\0' && f[1] == '\0') writer->overallocate = 0; + switch (*f) { + case 'd': case 'i': case 'o': case 'u': case 'x': case 'X': + break; + case 'c': case 'p': + if (sizemod || width >= 0 || precision >= 0) goto invalid_format; + break; + case 's': + case 'V': + if (sizemod && sizemod != F_LONG) goto invalid_format; + break; + default: + if (sizemod) goto invalid_format; + break; + } + switch (*f) { case 'c': { @@ -2889,78 +2588,98 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, break; } - case 'i': - case 'd': - case 'u': - case 'x': + case 'd': case 'i': + case 'o': case 'u': case 'x': case 'X': { /* used by sprintf */ - char buffer[MAX_LONG_LONG_CHARS]; - Py_ssize_t arglen; - - if (*f == 'u') { - if (longflag) { - len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long)); - } - else if (longlongflag) { - len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long)); - } - else if (size_tflag) { - len = sprintf(buffer, "%zu", va_arg(*vargs, size_t)); - } - else { - len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int)); - } - } - else if (*f == 'x') { - len = sprintf(buffer, "%x", va_arg(*vargs, int)); - } - else { - if (longflag) { - len = sprintf(buffer, "%li", va_arg(*vargs, long)); - } - else if (longlongflag) { - len = sprintf(buffer, "%lli", va_arg(*vargs, long long)); - } - else if (size_tflag) { - len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t)); - } - else { - len = sprintf(buffer, "%i", va_arg(*vargs, int)); - } + char buffer[MAX_INTMAX_CHARS]; + const char *fmt = NULL; + switch (*f) { + case 'o': fmt = formats_o[sizemod]; break; + case 'u': fmt = formats_u[sizemod]; break; + case 'x': fmt = formats_x[sizemod]; break; + case 'X': fmt = formats_X[sizemod]; break; + default: fmt = formats[sizemod]; break; + } + int issigned = (*f == 'd' || *f == 'i'); + switch (sizemod) { + case F_LONG: + len = issigned ? + sprintf(buffer, fmt, va_arg(*vargs, long)) : + sprintf(buffer, fmt, va_arg(*vargs, unsigned long)); + break; + case F_LONGLONG: + len = issigned ? + sprintf(buffer, fmt, va_arg(*vargs, long long)) : + sprintf(buffer, fmt, va_arg(*vargs, unsigned long long)); + break; + case F_SIZE: + len = issigned ? + sprintf(buffer, fmt, va_arg(*vargs, Py_ssize_t)) : + sprintf(buffer, fmt, va_arg(*vargs, size_t)); + break; + case F_PTRDIFF: + len = sprintf(buffer, fmt, va_arg(*vargs, ptrdiff_t)); + break; + case F_INTMAX: + len = issigned ? + sprintf(buffer, fmt, va_arg(*vargs, intmax_t)) : + sprintf(buffer, fmt, va_arg(*vargs, uintmax_t)); + break; + default: + len = issigned ? + sprintf(buffer, fmt, va_arg(*vargs, int)) : + sprintf(buffer, fmt, va_arg(*vargs, unsigned int)); + break; } assert(len >= 0); - if (precision < len) - precision = len; + int sign = (buffer[0] == '-'); + len -= sign; + + precision = Py_MAX(precision, len); + width = Py_MAX(width, precision + sign); + if ((flags & F_ZERO) && !(flags & F_LJUST)) { + precision = width - sign; + } + + Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0); + Py_ssize_t zeropad = Py_MAX(precision - len, 0); - arglen = Py_MAX(precision, width); - if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1) + if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1) return NULL; - if (width > precision) { - Py_UCS4 fillchar; - fill = width - precision; - fillchar = zeropad?'0':' '; - if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) + if (spacepad && !(flags & F_LJUST)) { + if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1) + return NULL; + writer->pos += spacepad; + } + + if (sign) { + if (_PyUnicodeWriter_WriteChar(writer, '-') == -1) return NULL; - writer->pos += fill; } - if (precision > len) { - fill = precision - len; - if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) + + if (zeropad) { + if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1) return NULL; - writer->pos += fill; + writer->pos += zeropad; } - if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0) + if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0) return NULL; + + if (spacepad && (flags & F_LJUST)) { + if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1) + return NULL; + writer->pos += spacepad; + } break; } case 'p': { - char number[MAX_LONG_LONG_CHARS]; + char number[MAX_INTMAX_CHARS]; len = sprintf(number, "%p", va_arg(*vargs, void*)); assert(len >= 0); @@ -2983,10 +2702,17 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, case 's': { - /* UTF-8 */ - const char *s = va_arg(*vargs, const char*); - if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0) - return NULL; + if (sizemod) { + const wchar_t *s = va_arg(*vargs, const wchar_t*); + if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0) + return NULL; + } + else { + /* UTF-8 */ + const char *s = va_arg(*vargs, const char*); + if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0) + return NULL; + } break; } @@ -2995,7 +2721,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, PyObject *obj = va_arg(*vargs, PyObject *); assert(obj && _PyUnicode_CHECK(obj)); - if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) + if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1) return NULL; break; } @@ -3003,15 +2729,27 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, case 'V': { PyObject *obj = va_arg(*vargs, PyObject *); - const char *str = va_arg(*vargs, const char *); + const char *str; + const wchar_t *wstr; + if (sizemod) { + wstr = va_arg(*vargs, const wchar_t*); + } + else { + str = va_arg(*vargs, const char *); + } if (obj) { assert(_PyUnicode_CHECK(obj)); - if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) + if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1) + return NULL; + } + else if (sizemod) { + assert(wstr != NULL); + if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0) return NULL; } else { assert(str != NULL); - if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0) + if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0) return NULL; } break; @@ -3025,7 +2763,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, str = PyObject_Str(obj); if (!str) return NULL; - if (unicode_fromformat_write_str(writer, str, width, precision) == -1) { + if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) { Py_DECREF(str); return NULL; } @@ -3041,7 +2779,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, repr = PyObject_Repr(obj); if (!repr) return NULL; - if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) { + if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) { Py_DECREF(repr); return NULL; } @@ -3057,7 +2795,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, ascii = PyObject_ASCII(obj); if (!ascii) return NULL; - if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) { + if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) { Py_DECREF(ascii); return NULL; } @@ -3065,21 +2803,10 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, break; } - case '%': - if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) - return NULL; - break; - default: - /* if we stumble upon an unknown formatting code, copy the rest - of the format string to the output string. (we cannot just - skip the code, since there's no way to know what's in the - argument list) */ - len = strlen(p); - if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1) - return NULL; - f = p+len; - return f; + invalid_format: + PyErr_Format(PyExc_SystemError, "invalid format string: %s", p); + return NULL; } f++; @@ -3149,11 +2876,7 @@ PyUnicode_FromFormat(const char *format, ...) PyObject* ret; va_list vargs; -#ifdef HAVE_STDARG_PROTOTYPES va_start(vargs, format); -#else - va_start(vargs); -#endif ret = PyUnicode_FromFormatV(format, vargs); va_end(vargs); return ret; @@ -3167,13 +2890,6 @@ unicode_get_widechar_size(PyObject *unicode) assert(unicode != NULL); assert(_PyUnicode_CHECK(unicode)); -#if USE_UNICODE_WCHAR_CACHE - if (_PyUnicode_WSTR(unicode) != NULL) { - return PyUnicode_WSTR_LENGTH(unicode); - } -#endif /* USE_UNICODE_WCHAR_CACHE */ - assert(PyUnicode_IS_READY(unicode)); - res = _PyUnicode_LENGTH(unicode); #if SIZEOF_WCHAR_T == 2 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { @@ -3195,19 +2911,10 @@ unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size) assert(unicode != NULL); assert(_PyUnicode_CHECK(unicode)); -#if USE_UNICODE_WCHAR_CACHE - const wchar_t *wstr = _PyUnicode_WSTR(unicode); - if (wstr != NULL) { - memcpy(w, wstr, size * sizeof(wchar_t)); - return; - } -#else /* USE_UNICODE_WCHAR_CACHE */ if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) { memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t)); return; } -#endif /* USE_UNICODE_WCHAR_CACHE */ - assert(PyUnicode_IS_READY(unicode)); if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode); @@ -3312,7 +3019,7 @@ PyUnicode_AsWideCharString(PyObject *unicode, } buflen = unicode_get_widechar_size(unicode); - buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1)); + buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1)); if (buffer == NULL) { PyErr_NoMemory(); return NULL; @@ -3348,26 +3055,16 @@ _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr) { wchar_t **p = (wchar_t **)ptr; if (obj == NULL) { -#if !USE_UNICODE_WCHAR_CACHE PyMem_Free(*p); -#endif /* USE_UNICODE_WCHAR_CACHE */ *p = NULL; return 1; } if (PyUnicode_Check(obj)) { -#if USE_UNICODE_WCHAR_CACHE - *p = (wchar_t *)_PyUnicode_AsUnicode(obj); - if (*p == NULL) { - return 0; - } - return 1; -#else /* USE_UNICODE_WCHAR_CACHE */ *p = PyUnicode_AsWideCharString(obj, NULL); if (*p == NULL) { return 0; } return Py_CLEANUP_SUPPORTED; -#endif /* USE_UNICODE_WCHAR_CACHE */ } PyErr_Format(PyExc_TypeError, "argument must be str, not %.50s", @@ -3380,9 +3077,7 @@ _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr) { wchar_t **p = (wchar_t **)ptr; if (obj == NULL) { -#if !USE_UNICODE_WCHAR_CACHE PyMem_Free(*p); -#endif /* USE_UNICODE_WCHAR_CACHE */ *p = NULL; return 1; } @@ -3391,19 +3086,11 @@ _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr) return 1; } if (PyUnicode_Check(obj)) { -#if USE_UNICODE_WCHAR_CACHE - *p = (wchar_t *)_PyUnicode_AsUnicode(obj); - if (*p == NULL) { - return 0; - } - return 1; -#else /* USE_UNICODE_WCHAR_CACHE */ *p = PyUnicode_AsWideCharString(obj, NULL); if (*p == NULL) { return 0; } return Py_CLEANUP_SUPPORTED; -#endif /* USE_UNICODE_WCHAR_CACHE */ } PyErr_Format(PyExc_TypeError, "argument must be str or None, not %.50s", @@ -3429,10 +3116,7 @@ PyUnicode_FromObject(PyObject *obj) /* XXX Perhaps we should make this API an alias of PyObject_Str() instead ?! */ if (PyUnicode_CheckExact(obj)) { - if (PyUnicode_READY(obj) == -1) - return NULL; - Py_INCREF(obj); - return obj; + return Py_NewRef(obj); } if (PyUnicode_Check(obj)) { /* For a Unicode subtype that's not a Unicode object, @@ -4103,48 +3787,25 @@ PyUnicode_FSConverter(PyObject* arg, void* addr) int PyUnicode_FSDecoder(PyObject* arg, void* addr) { - int is_buffer = 0; - PyObject *path = NULL; - PyObject *output = NULL; if (arg == NULL) { Py_DECREF(*(PyObject**)addr); *(PyObject**)addr = NULL; return 1; } - is_buffer = PyObject_CheckBuffer(arg); - if (!is_buffer) { - path = PyOS_FSPath(arg); - if (path == NULL) { - return 0; - } - } - else { - path = arg; - Py_INCREF(arg); + PyObject *path = PyOS_FSPath(arg); + if (path == NULL) { + return 0; } + PyObject *output = NULL; if (PyUnicode_Check(path)) { output = path; } - else if (PyBytes_Check(path) || is_buffer) { - PyObject *path_bytes = NULL; - - if (!PyBytes_Check(path) && - PyErr_WarnFormat(PyExc_DeprecationWarning, 1, - "path should be string, bytes, or os.PathLike, not %.200s", - Py_TYPE(arg)->tp_name)) { - Py_DECREF(path); - return 0; - } - path_bytes = PyBytes_FromObject(path); + else if (PyBytes_Check(path)) { + output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path), + PyBytes_GET_SIZE(path)); Py_DECREF(path); - if (!path_bytes) { - return 0; - } - output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes), - PyBytes_GET_SIZE(path_bytes)); - Py_DECREF(path_bytes); if (!output) { return 0; } @@ -4156,10 +3817,7 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr) Py_DECREF(path); return 0; } - if (PyUnicode_READY(output) == -1) { - Py_DECREF(output); - return 0; - } + if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { PyErr_SetString(PyExc_ValueError, "embedded null character"); @@ -4180,8 +3838,6 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) PyErr_BadArgument(); return NULL; } - if (PyUnicode_READY(unicode) == -1) - return NULL; if (PyUnicode_UTF8(unicode) == NULL) { if (unicode_fill_utf8(unicode) == -1) { @@ -4200,85 +3856,22 @@ PyUnicode_AsUTF8(PyObject *unicode) return PyUnicode_AsUTF8AndSize(unicode, NULL); } -Py_UNICODE * -PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) -{ - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; - } - Py_UNICODE *w = _PyUnicode_WSTR(unicode); - if (w == NULL) { - /* Non-ASCII compact unicode object */ - assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND); - assert(PyUnicode_IS_READY(unicode)); - - Py_ssize_t wlen = unicode_get_widechar_size(unicode); - if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { - PyErr_NoMemory(); - return NULL; - } - w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1)); - if (w == NULL) { - PyErr_NoMemory(); - return NULL; - } - unicode_copy_as_widechar(unicode, w, wlen + 1); - _PyUnicode_WSTR(unicode) = w; - if (!PyUnicode_IS_COMPACT_ASCII(unicode)) { - _PyUnicode_WSTR_LENGTH(unicode) = wlen; - } - } - if (size != NULL) - *size = PyUnicode_WSTR_LENGTH(unicode); - return w; -} - -/* Deprecated APIs */ - -_Py_COMP_DIAG_PUSH -_Py_COMP_DIAG_IGNORE_DEPR_DECLS - -Py_UNICODE * -PyUnicode_AsUnicode(PyObject *unicode) -{ - return PyUnicode_AsUnicodeAndSize(unicode, NULL); -} - -const Py_UNICODE * -_PyUnicode_AsUnicode(PyObject *unicode) -{ - Py_ssize_t size; - const Py_UNICODE *wstr; - - wstr = PyUnicode_AsUnicodeAndSize(unicode, &size); - if (wstr && wcslen(wstr) != (size_t)size) { - PyErr_SetString(PyExc_ValueError, "embedded null character"); - return NULL; - } - return wstr; -} - +/* +PyUnicode_GetSize() has been deprecated since Python 3.3 +because it returned length of Py_UNICODE. -Py_ssize_t +But this function is part of stable abi, because it don't +include Py_UNICODE in signature and it was not excluded from +stable abi in PEP 384. +*/ +PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(PyObject *unicode) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - goto onError; - } - if (_PyUnicode_WSTR(unicode) == NULL) { - if (PyUnicode_AsUnicode(unicode) == NULL) - goto onError; - } - return PyUnicode_WSTR_LENGTH(unicode); - - onError: + PyErr_SetString(PyExc_RuntimeError, + "PyUnicode_GetSize has been removed."); return -1; } -_Py_COMP_DIAG_POP - Py_ssize_t PyUnicode_GetLength(PyObject *unicode) { @@ -4286,8 +3879,6 @@ PyUnicode_GetLength(PyObject *unicode) PyErr_BadArgument(); return -1; } - if (PyUnicode_READY(unicode) == -1) - return -1; return PyUnicode_GET_LENGTH(unicode); } @@ -4301,9 +3892,6 @@ PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) PyErr_BadArgument(); return (Py_UCS4)-1; } - if (PyUnicode_READY(unicode) == -1) { - return (Py_UCS4)-1; - } if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { PyErr_SetString(PyExc_IndexError, "string index out of range"); return (Py_UCS4)-1; @@ -4320,7 +3908,6 @@ PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) PyErr_BadArgument(); return -1; } - assert(PyUnicode_IS_READY(unicode)); if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { PyErr_SetString(PyExc_IndexError, "string index out of range"); return -1; @@ -4453,19 +4040,10 @@ unicode_decode_call_errorhandler_wchar( goto onError; } -#if USE_UNICODE_WCHAR_CACHE -_Py_COMP_DIAG_PUSH -_Py_COMP_DIAG_IGNORE_DEPR_DECLS - repwlen = PyUnicode_GetSize(repunicode); - if (repwlen < 0) - goto onError; -_Py_COMP_DIAG_POP -#else /* USE_UNICODE_WCHAR_CACHE */ repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0); if (repwlen < 0) goto onError; repwlen--; -#endif /* USE_UNICODE_WCHAR_CACHE */ /* need more space? (at least enough for what we have+the replacement+the rest of the string (starting at the new input position), so we won't have to check space @@ -4915,8 +4493,6 @@ _PyUnicode_EncodeUTF7(PyObject *str, char * out; const char * start; - if (PyUnicode_READY(str) == -1) - return NULL; kind = PyUnicode_KIND(str); data = PyUnicode_DATA(str); len = PyUnicode_GET_LENGTH(str); @@ -5548,14 +5124,11 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, return NULL; } - if (PyUnicode_READY(unicode) == -1) - return NULL; - if (PyUnicode_UTF8(unicode)) return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), PyUnicode_UTF8_LENGTH(unicode)); - enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); + int kind = PyUnicode_KIND(unicode); const void *data = PyUnicode_DATA(unicode); Py_ssize_t size = PyUnicode_GET_LENGTH(unicode); @@ -5591,7 +5164,7 @@ unicode_fill_utf8(PyObject *unicode) /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ assert(!PyUnicode_IS_ASCII(unicode)); - enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); + int kind = PyUnicode_KIND(unicode); const void *data = PyUnicode_DATA(unicode); Py_ssize_t size = PyUnicode_GET_LENGTH(unicode); @@ -5726,7 +5299,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s, Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer); if (e - q >= 4) { - enum PyUnicode_Kind kind = writer.kind; + int kind = writer.kind; void *data = writer.data; const unsigned char *last = e - 4; Py_ssize_t pos = writer.pos; @@ -5811,7 +5384,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, const char *errors, int byteorder) { - enum PyUnicode_Kind kind; + int kind; const void *data; Py_ssize_t len; PyObject *v; @@ -5831,8 +5404,6 @@ _PyUnicode_EncodeUTF32(PyObject *str, PyErr_BadArgument(); return NULL; } - if (PyUnicode_READY(str) == -1) - return NULL; kind = PyUnicode_KIND(str); data = PyUnicode_DATA(str); len = PyUnicode_GET_LENGTH(str); @@ -5899,8 +5470,6 @@ _PyUnicode_EncodeUTF32(PyObject *str, } else { assert(PyUnicode_Check(rep)); - if (PyUnicode_READY(rep) < 0) - goto error; moreunits = repsize = PyUnicode_GET_LENGTH(rep); if (!PyUnicode_IS_ASCII(rep)) { raise_encode_exception(&exc, encoding, @@ -6132,7 +5701,7 @@ _PyUnicode_EncodeUTF16(PyObject *str, const char *errors, int byteorder) { - enum PyUnicode_Kind kind; + int kind; const void *data; Py_ssize_t len; PyObject *v; @@ -6153,8 +5722,6 @@ _PyUnicode_EncodeUTF16(PyObject *str, PyErr_BadArgument(); return NULL; } - if (PyUnicode_READY(str) == -1) - return NULL; kind = PyUnicode_KIND(str); data = PyUnicode_DATA(str); len = PyUnicode_GET_LENGTH(str); @@ -6238,8 +5805,6 @@ _PyUnicode_EncodeUTF16(PyObject *str, } else { assert(PyUnicode_Check(rep)); - if (PyUnicode_READY(rep) < 0) - goto error; moreunits = repsize = PyUnicode_GET_LENGTH(rep); if (!PyUnicode_IS_ASCII(rep)) { raise_encode_exception(&exc, encoding, @@ -6303,8 +5868,6 @@ PyUnicode_AsUTF16String(PyObject *unicode) /* --- Unicode Escape Codec ----------------------------------------------- */ -static _PyUnicode_Name_CAPI *ucnhash_capi = NULL; - PyObject * _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, Py_ssize_t size, @@ -6317,6 +5880,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, const char *end; PyObject *errorHandler = NULL; PyObject *exc = NULL; + _PyUnicode_Name_CAPI *ucnhash_capi; + PyInterpreterState *interp = _PyInterpreterState_Get(); // so we can remember if we've seen an invalid escape char or not *first_invalid_escape = NULL; @@ -6464,6 +6029,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, /* \N{name} */ case 'N': + ucnhash_capi = interp->unicode.ucnhash_capi; if (ucnhash_capi == NULL) { /* load the unicode data module */ ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import( @@ -6475,6 +6041,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, ); goto onError; } + interp->unicode.ucnhash_capi = ucnhash_capi; } message = "malformed \\N character escape"; @@ -6601,7 +6168,7 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) Py_ssize_t i, len; PyObject *repr; char *p; - enum PyUnicode_Kind kind; + int kind; const void *data; Py_ssize_t expandsize; @@ -6617,9 +6184,6 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) PyErr_BadArgument(); return NULL; } - if (PyUnicode_READY(unicode) == -1) { - return NULL; - } len = PyUnicode_GET_LENGTH(unicode); if (len == 0) { @@ -6874,9 +6438,6 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) PyErr_BadArgument(); return NULL; } - if (PyUnicode_READY(unicode) == -1) { - return NULL; - } kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); len = PyUnicode_GET_LENGTH(unicode); @@ -7013,8 +6574,6 @@ unicode_encode_call_errorhandler(const char *errors, return NULL; } - if (PyUnicode_READY(unicode) == -1) - return NULL; len = PyUnicode_GET_LENGTH(unicode); make_encode_exception(exceptionObject, @@ -7072,8 +6631,6 @@ unicode_encode_ucs1(PyObject *unicode, /* output object */ _PyBytesWriter writer; - if (PyUnicode_READY(unicode) == -1) - return NULL; size = PyUnicode_GET_LENGTH(unicode); kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); @@ -7192,9 +6749,6 @@ unicode_encode_ucs1(PyObject *unicode, else { assert(PyUnicode_Check(rep)); - if (PyUnicode_READY(rep) < 0) - goto onError; - if (limit == 256 ? PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND : !PyUnicode_IS_ASCII(rep)) @@ -7241,8 +6795,6 @@ _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) PyErr_BadArgument(); return NULL; } - if (PyUnicode_READY(unicode) == -1) - return NULL; /* Fast path: if it is a one-byte string, construct bytes object directly. */ if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) @@ -7367,8 +6919,6 @@ _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) PyErr_BadArgument(); return NULL; } - if (PyUnicode_READY(unicode) == -1) - return NULL; /* Fast path: if it is an ASCII-only string, construct bytes object directly. Else defer to above function to raise the exception. */ if (PyUnicode_IS_ASCII(unicode)) @@ -7756,22 +7306,11 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes, substring = PyUnicode_Substring(unicode, offset, offset+len); if (substring == NULL) return -1; -#if USE_UNICODE_WCHAR_CACHE -_Py_COMP_DIAG_PUSH -_Py_COMP_DIAG_IGNORE_DEPR_DECLS - p = PyUnicode_AsUnicodeAndSize(substring, &size); - if (p == NULL) { - Py_DECREF(substring); - return -1; - } -_Py_COMP_DIAG_POP -#else /* USE_UNICODE_WCHAR_CACHE */ p = PyUnicode_AsWideCharString(substring, &size); Py_CLEAR(substring); if (p == NULL) { return -1; } -#endif /* USE_UNICODE_WCHAR_CACHE */ assert(size <= INT_MAX); /* First get the size of the result */ @@ -7822,11 +7361,7 @@ _Py_COMP_DIAG_POP ret = 0; done: -#if USE_UNICODE_WCHAR_CACHE - Py_DECREF(substring); -#else /* USE_UNICODE_WCHAR_CACHE */ PyMem_Free(p); -#endif /* USE_UNICODE_WCHAR_CACHE */ return ret; error: @@ -7976,14 +7511,9 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, } else { Py_ssize_t i; - enum PyUnicode_Kind kind; + int kind; const void *data; - if (PyUnicode_READY(rep) == -1) { - Py_DECREF(rep); - goto error; - } - outsize = PyUnicode_GET_LENGTH(rep); morebytes += outsize; if (morebytes > 0) { @@ -8044,8 +7574,6 @@ encode_code_page(int code_page, return NULL; } - if (PyUnicode_READY(unicode) == -1) - return NULL; len = PyUnicode_GET_LENGTH(unicode); if (code_page < 0) { @@ -8122,14 +7650,11 @@ charmap_decode_string(const char *s, Py_ssize_t startinpos, endinpos; PyObject *errorHandler = NULL, *exc = NULL; Py_ssize_t maplen; - enum PyUnicode_Kind mapkind; + int mapkind; const void *mapdata; Py_UCS4 x; unsigned char ch; - if (PyUnicode_READY(mapping) == -1) - return -1; - maplen = PyUnicode_GET_LENGTH(mapping); mapdata = PyUnicode_DATA(mapping); mapkind = PyUnicode_KIND(mapping); @@ -8163,7 +7688,7 @@ charmap_decode_string(const char *s, while (s < e) { if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { - enum PyUnicode_Kind outkind = writer->kind; + int outkind = writer->kind; const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata; if (outkind == PyUnicode_1BYTE_KIND) { Py_UCS1 *outdata = (Py_UCS1 *)writer->data; @@ -8282,8 +7807,6 @@ charmap_decode_mapping(const char *s, goto onError; } else if (PyUnicode_Check(item)) { - if (PyUnicode_READY(item) == -1) - goto onError; if (PyUnicode_GET_LENGTH(item) == 1) { Py_UCS4 value = PyUnicode_READ_CHAR(item, 0); if (value == 0xFFFE) @@ -8689,7 +8212,7 @@ charmap_encoding_error( PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ Py_ssize_t size, repsize; Py_ssize_t newpos; - enum PyUnicode_Kind kind; + int kind; const void *data; Py_ssize_t index; /* startpos for collecting unencodable chars */ @@ -8702,8 +8225,6 @@ charmap_encoding_error( Py_UCS4 ch; int val; - if (PyUnicode_READY(unicode) == -1) - return -1; size = PyUnicode_GET_LENGTH(unicode); /* find all unencodable characters */ while (collendpos < size) { @@ -8799,10 +8320,6 @@ charmap_encoding_error( break; } /* generate replacement */ - if (PyUnicode_READY(repunicode) == -1) { - Py_DECREF(repunicode); - return -1; - } repsize = PyUnicode_GET_LENGTH(repunicode); data = PyUnicode_DATA(repunicode); kind = PyUnicode_KIND(repunicode); @@ -8843,8 +8360,6 @@ _PyUnicode_EncodeCharmap(PyObject *unicode, const void *data; int kind; - if (PyUnicode_READY(unicode) == -1) - return NULL; size = PyUnicode_GET_LENGTH(unicode); data = PyUnicode_DATA(unicode); kind = PyUnicode_KIND(unicode); @@ -9123,10 +8638,6 @@ unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch, else if (PyUnicode_Check(item)) { Py_UCS4 replace; - if (PyUnicode_READY(item) == -1) { - Py_DECREF(item); - return -1; - } if (PyUnicode_GET_LENGTH(item) != 1) goto exit; @@ -9223,8 +8734,6 @@ _PyUnicode_TranslateCharmap(PyObject *input, return NULL; } - if (PyUnicode_READY(input) == -1) - return NULL; data = PyUnicode_DATA(input); kind = PyUnicode_KIND(input); size = PyUnicode_GET_LENGTH(input); @@ -9240,8 +8749,6 @@ _PyUnicode_TranslateCharmap(PyObject *input, ignore = (errors != NULL && strcmp(errors, "ignore") == 0); - if (PyUnicode_READY(input) == -1) - return NULL; if (PyUnicode_IS_ASCII(input)) { res = unicode_fast_translate(input, mapping, &writer, ignore, &i); if (res < 0) { @@ -9337,12 +8844,9 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) PyErr_BadInternalCall(); return NULL; } - if (PyUnicode_READY(unicode) == -1) - return NULL; if (PyUnicode_IS_ASCII(unicode)) { /* If the string is already ASCII, just return the same string */ - Py_INCREF(unicode); - return unicode; + return Py_NewRef(unicode); } Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); @@ -9530,15 +9034,6 @@ _PyUnicode_InsertThousandsGrouping( assert(0 <= n_digits); assert(grouping != NULL); - if (digits != NULL) { - if (PyUnicode_READY(digits) == -1) { - return -1; - } - } - if (PyUnicode_READY(thousands_sep) == -1) { - return -1; - } - Py_ssize_t count = 0; Py_ssize_t n_zeros; int loop_broken = 0; @@ -9624,21 +9119,20 @@ _PyUnicode_InsertThousandsGrouping( return count; } - -Py_ssize_t -PyUnicode_Count(PyObject *str, - PyObject *substr, - Py_ssize_t start, - Py_ssize_t end) +static Py_ssize_t +unicode_count_impl(PyObject *str, + PyObject *substr, + Py_ssize_t start, + Py_ssize_t end) { + assert(PyUnicode_Check(str)); + assert(PyUnicode_Check(substr)); + Py_ssize_t result; int kind1, kind2; const void *buf1 = NULL, *buf2 = NULL; Py_ssize_t len1, len2; - if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) - return -1; - kind1 = PyUnicode_KIND(str); kind2 = PyUnicode_KIND(substr); if (kind1 < kind2) @@ -9658,18 +9152,13 @@ PyUnicode_Count(PyObject *str, goto onError; } + // We don't reuse `anylib_count` here because of the explicit casts. switch (kind1) { case PyUnicode_1BYTE_KIND: - if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr)) - result = asciilib_count( - ((const Py_UCS1*)buf1) + start, end - start, - buf2, len2, PY_SSIZE_T_MAX - ); - else - result = ucs1lib_count( - ((const Py_UCS1*)buf1) + start, end - start, - buf2, len2, PY_SSIZE_T_MAX - ); + result = ucs1lib_count( + ((const Py_UCS1*)buf1) + start, end - start, + buf2, len2, PY_SSIZE_T_MAX + ); break; case PyUnicode_2BYTE_KIND: result = ucs2lib_count( @@ -9699,6 +9188,18 @@ PyUnicode_Count(PyObject *str, return -1; } +Py_ssize_t +PyUnicode_Count(PyObject *str, + PyObject *substr, + Py_ssize_t start, + Py_ssize_t end) +{ + if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) + return -1; + + return unicode_count_impl(str, substr, start, end); +} + Py_ssize_t PyUnicode_Find(PyObject *str, PyObject *substr, @@ -9719,8 +9220,6 @@ PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, { int kind; Py_ssize_t len, result; - if (PyUnicode_READY(str) == -1) - return -2; len = PyUnicode_GET_LENGTH(str); ADJUST_INDICES(start, end, len); if (end - start < 1) @@ -9749,10 +9248,6 @@ tailmatch(PyObject *self, Py_ssize_t i; Py_ssize_t end_sub; - if (PyUnicode_READY(self) == -1 || - PyUnicode_READY(substring) == -1) - return -1; - ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); end -= PyUnicode_GET_LENGTH(substring); if (end < start) @@ -10011,8 +9506,6 @@ case_operation(PyObject *self, void *outdata; Py_UCS4 maxchar = 0, *tmp, *tmpend; - assert(PyUnicode_IS_READY(self)); - kind = PyUnicode_KIND(self); data = PyUnicode_DATA(self); length = PyUnicode_GET_LENGTH(self); @@ -10085,7 +9578,7 @@ _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seq int use_memcpy; unsigned char *res_data = NULL, *sep_data = NULL; PyObject *last_obj; - unsigned int kind = 0; + int kind = 0; /* If empty sequence, return u"". */ if (seqlen == 0) { @@ -10097,8 +9590,7 @@ _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seq if (seqlen == 1) { if (PyUnicode_CheckExact(items[0])) { res = items[0]; - Py_INCREF(res); - return res; + return Py_NewRef(res); } seplen = 0; maxchar = 0; @@ -10121,8 +9613,6 @@ _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seq Py_TYPE(separator)->tp_name); goto onError; } - if (PyUnicode_READY(separator)) - goto onError; sep = separator; seplen = PyUnicode_GET_LENGTH(separator); maxchar = PyUnicode_MAX_CHAR_VALUE(separator); @@ -10154,8 +9644,6 @@ _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seq i, Py_TYPE(item)->tp_name); goto onError; } - if (PyUnicode_READY(item) == -1) - goto onError; add_sz = PyUnicode_GET_LENGTH(item); item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); maxchar = Py_MAX(maxchar, item_maxchar); @@ -10248,9 +9736,8 @@ void _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, Py_UCS4 fill_char) { - const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); + const int kind = PyUnicode_KIND(unicode); void *data = PyUnicode_DATA(unicode); - assert(PyUnicode_IS_READY(unicode)); assert(unicode_modifiable(unicode)); assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); assert(start >= 0); @@ -10268,8 +9755,6 @@ PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, PyErr_BadInternalCall(); return -1; } - if (PyUnicode_READY(unicode) == -1) - return -1; if (unicode_check_modifiable(unicode)) return -1; @@ -10378,53 +9863,53 @@ split(PyObject *self, const void *buf1, *buf2; Py_ssize_t len1, len2; PyObject* out; + len1 = PyUnicode_GET_LENGTH(self); + kind1 = PyUnicode_KIND(self); - if (maxcount < 0) - maxcount = PY_SSIZE_T_MAX; - - if (PyUnicode_READY(self) == -1) - return NULL; - - if (substring == NULL) - switch (PyUnicode_KIND(self)) { + if (substring == NULL) { + if (maxcount < 0) { + maxcount = (len1 - 1) / 2 + 1; + } + switch (kind1) { case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self)) return asciilib_split_whitespace( self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + len1, maxcount ); else return ucs1lib_split_whitespace( self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + len1, maxcount ); case PyUnicode_2BYTE_KIND: return ucs2lib_split_whitespace( self, PyUnicode_2BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + len1, maxcount ); case PyUnicode_4BYTE_KIND: return ucs4lib_split_whitespace( self, PyUnicode_4BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + len1, maxcount ); default: Py_UNREACHABLE(); } + } - if (PyUnicode_READY(substring) == -1) - return NULL; - - kind1 = PyUnicode_KIND(self); kind2 = PyUnicode_KIND(substring); - len1 = PyUnicode_GET_LENGTH(self); len2 = PyUnicode_GET_LENGTH(substring); + if (maxcount < 0) { + // if len2 == 0, it will raise ValueError. + maxcount = len2 == 0 ? 0 : (len1 / len2) + 1; + // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1 + maxcount = maxcount < 0 ? len1 : maxcount; + } if (kind1 < kind2 || len1 < len2) { out = PyList_New(1); if (out == NULL) return NULL; - Py_INCREF(self); - PyList_SET_ITEM(out, 0, self); + PyList_SET_ITEM(out, 0, Py_NewRef(self)); return out; } buf1 = PyUnicode_DATA(self); @@ -10471,52 +9956,52 @@ rsplit(PyObject *self, Py_ssize_t len1, len2; PyObject* out; - if (maxcount < 0) - maxcount = PY_SSIZE_T_MAX; - - if (PyUnicode_READY(self) == -1) - return NULL; + len1 = PyUnicode_GET_LENGTH(self); + kind1 = PyUnicode_KIND(self); - if (substring == NULL) - switch (PyUnicode_KIND(self)) { + if (substring == NULL) { + if (maxcount < 0) { + maxcount = (len1 - 1) / 2 + 1; + } + switch (kind1) { case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self)) return asciilib_rsplit_whitespace( self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + len1, maxcount ); else return ucs1lib_rsplit_whitespace( self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + len1, maxcount ); case PyUnicode_2BYTE_KIND: return ucs2lib_rsplit_whitespace( self, PyUnicode_2BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + len1, maxcount ); case PyUnicode_4BYTE_KIND: return ucs4lib_rsplit_whitespace( self, PyUnicode_4BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + len1, maxcount ); default: Py_UNREACHABLE(); } - - if (PyUnicode_READY(substring) == -1) - return NULL; - - kind1 = PyUnicode_KIND(self); + } kind2 = PyUnicode_KIND(substring); - len1 = PyUnicode_GET_LENGTH(self); len2 = PyUnicode_GET_LENGTH(substring); + if (maxcount < 0) { + // if len2 == 0, it will raise ValueError. + maxcount = len2 == 0 ? 0 : (len1 / len2) + 1; + // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1 + maxcount = maxcount < 0 ? len1 : maxcount; + } if (kind1 < kind2 || len1 < len2) { out = PyList_New(1); if (out == NULL) return NULL; - Py_INCREF(self); - PyList_SET_ITEM(out, 0, self); + PyList_SET_ITEM(out, 0, Py_NewRef(self)); return out; } buf1 = PyUnicode_DATA(self); @@ -10577,10 +10062,7 @@ anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen, { switch (kind) { case PyUnicode_1BYTE_KIND: - if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) - return asciilib_count(sbuf, slen, buf1, len1, maxcount); - else - return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); + return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); case PyUnicode_2BYTE_KIND: return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); case PyUnicode_4BYTE_KIND: @@ -10908,8 +10390,6 @@ static PyObject * unicode_title_impl(PyObject *self) /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/ { - if (PyUnicode_READY(self) == -1) - return NULL; return case_operation(self, do_title); } @@ -10926,8 +10406,6 @@ static PyObject * unicode_capitalize_impl(PyObject *self) /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/ { - if (PyUnicode_READY(self) == -1) - return NULL; if (PyUnicode_GET_LENGTH(self) == 0) return unicode_result_unchanged(self); return case_operation(self, do_capitalize); @@ -10943,8 +10421,6 @@ static PyObject * unicode_casefold_impl(PyObject *self) /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/ { - if (PyUnicode_READY(self) == -1) - return NULL; if (PyUnicode_IS_ASCII(self)) return ascii_upper_or_lower(self, 1); return case_operation(self, do_casefold); @@ -10964,8 +10440,6 @@ convert_uc(PyObject *obj, void *addr) "not %.100s", Py_TYPE(obj)->tp_name); return 0; } - if (PyUnicode_READY(obj) < 0) - return 0; if (PyUnicode_GET_LENGTH(obj) != 1) { PyErr_SetString(PyExc_TypeError, "The fill character must be exactly one character long"); @@ -10993,9 +10467,6 @@ unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) { Py_ssize_t marg, left; - if (PyUnicode_READY(self) == -1) - return NULL; - if (PyUnicode_GET_LENGTH(self) >= width) return unicode_result_unchanged(self); @@ -11152,9 +10623,6 @@ _PyUnicode_Equal(PyObject *str1, PyObject *str2) if (str1 == str2) { return 1; } - if (PyUnicode_READY(str1) || PyUnicode_READY(str2)) { - return -1; - } return unicode_compare_eq(str1, str2); } @@ -11163,10 +10631,6 @@ int PyUnicode_Compare(PyObject *left, PyObject *right) { if (PyUnicode_Check(left) && PyUnicode_Check(right)) { - if (PyUnicode_READY(left) == -1 || - PyUnicode_READY(right) == -1) - return -1; - /* a string is equal to itself */ if (left == right) return 0; @@ -11186,24 +10650,8 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) Py_ssize_t i; int kind; Py_UCS4 chr; - const unsigned char *ustr = (const unsigned char *)str; assert(_PyUnicode_CHECK(uni)); - if (!PyUnicode_IS_READY(uni)) { - const wchar_t *ws = _PyUnicode_WSTR(uni); - /* Compare Unicode string and source character set string */ - for (i = 0; (chr = ws[i]) && ustr[i]; i++) { - if (chr != ustr[i]) - return (chr < ustr[i]) ? -1 : 1; - } - /* This check keeps Python strings that end in '\0' from comparing equal - to C strings identical up to that point. */ - if (_PyUnicode_WSTR_LENGTH(uni) != i || chr) - return 1; /* uni is longer */ - if (ustr[i]) - return -1; /* str is longer */ - return 0; - } kind = PyUnicode_KIND(uni); if (kind == PyUnicode_1BYTE_KIND) { const void *data = PyUnicode_1BYTE_DATA(uni); @@ -11241,24 +10689,6 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) } } -static int -non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str) -{ - size_t i, len; - const wchar_t *p; - len = (size_t)_PyUnicode_WSTR_LENGTH(unicode); - if (strlen(str) != len) - return 0; - p = _PyUnicode_WSTR(unicode); - assert(p); - for (i = 0; i < len; i++) { - unsigned char c = (unsigned char)str[i]; - if (c >= 128 || p[i] != (wchar_t)c) - return 0; - } - return 1; -} - int _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str) { @@ -11270,11 +10700,6 @@ _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str) assert((unsigned char)*p < 128); } #endif - if (PyUnicode_READY(unicode) == -1) { - /* Memory error or bad data */ - PyErr_Clear(); - return non_ready_unicode_equal_to_ascii_string(unicode, str); - } if (!PyUnicode_IS_ASCII(unicode)) return 0; len = (size_t)PyUnicode_GET_LENGTH(unicode); @@ -11291,15 +10716,9 @@ _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right) assert(right->string); #ifndef NDEBUG for (const char *p = right->string; *p; p++) { - assert((unsigned char)*p < 128); - } -#endif - - if (PyUnicode_READY(left) == -1) { - /* memory error or bad data */ - PyErr_Clear(); - return non_ready_unicode_equal_to_ascii_string(left, right->string); + assert((unsigned char)*p < 128); } +#endif if (!PyUnicode_IS_ASCII(left)) return 0; @@ -11334,10 +10753,6 @@ PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) Py_RETURN_NOTIMPLEMENTED; - if (PyUnicode_READY(left) == -1 || - PyUnicode_READY(right) == -1) - return NULL; - if (left == right) { switch (op) { case Py_EQ: @@ -11385,8 +10800,6 @@ PyUnicode_Contains(PyObject *str, PyObject *substr) Py_TYPE(substr)->tp_name); return -1; } - if (PyUnicode_READY(substr) == -1) - return -1; if (ensure_unicode(str) < 0) return -1; @@ -11450,8 +10863,6 @@ PyUnicode_Concat(PyObject *left, PyObject *right) Py_TYPE(right)->tp_name); return NULL; } - if (PyUnicode_READY(right) < 0) - return NULL; /* Shortcuts */ PyObject *empty = unicode_get_empty(); // Borrowed reference @@ -11505,17 +10916,11 @@ PyUnicode_Append(PyObject **p_left, PyObject *right) goto error; } - if (PyUnicode_READY(left) == -1) - goto error; - if (PyUnicode_READY(right) == -1) - goto error; - /* Shortcuts */ PyObject *empty = unicode_get_empty(); // Borrowed reference if (left == empty) { Py_DECREF(left); - Py_INCREF(right); - *p_left = right; + *p_left = Py_NewRef(right); return; } if (right == empty) { @@ -11576,7 +10981,7 @@ PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) } /* -Wraps stringlib_parse_args_finds() and additionally ensures that the +Wraps asciilib_parse_args_finds() and additionally ensures that the first argument is a unicode object. */ @@ -11585,8 +10990,7 @@ parse_args_finds_unicode(const char * function_name, PyObject *args, PyObject **substring, Py_ssize_t *start, Py_ssize_t *end) { - if(stringlib_parse_args_finds(function_name, args, substring, - start, end)) { + if (asciilib_parse_args_finds(function_name, args, substring, start, end)) { if (ensure_unicode(*substring) < 0) return 0; return 1; @@ -11607,62 +11011,16 @@ unicode_count(PyObject *self, PyObject *args) PyObject *substring = NULL; /* initialize to fix a compiler warning */ Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; - PyObject *result; - int kind1, kind2; - const void *buf1, *buf2; - Py_ssize_t len1, len2, iresult; + Py_ssize_t result; if (!parse_args_finds_unicode("count", args, &substring, &start, &end)) return NULL; - kind1 = PyUnicode_KIND(self); - kind2 = PyUnicode_KIND(substring); - if (kind1 < kind2) - return PyLong_FromLong(0); - - len1 = PyUnicode_GET_LENGTH(self); - len2 = PyUnicode_GET_LENGTH(substring); - ADJUST_INDICES(start, end, len1); - if (end - start < len2) - return PyLong_FromLong(0); - - buf1 = PyUnicode_DATA(self); - buf2 = PyUnicode_DATA(substring); - if (kind2 != kind1) { - buf2 = unicode_askind(kind2, buf2, len2, kind1); - if (!buf2) - return NULL; - } - switch (kind1) { - case PyUnicode_1BYTE_KIND: - iresult = ucs1lib_count( - ((const Py_UCS1*)buf1) + start, end - start, - buf2, len2, PY_SSIZE_T_MAX - ); - break; - case PyUnicode_2BYTE_KIND: - iresult = ucs2lib_count( - ((const Py_UCS2*)buf1) + start, end - start, - buf2, len2, PY_SSIZE_T_MAX - ); - break; - case PyUnicode_4BYTE_KIND: - iresult = ucs4lib_count( - ((const Py_UCS4*)buf1) + start, end - start, - buf2, len2, PY_SSIZE_T_MAX - ); - break; - default: - Py_UNREACHABLE(); - } - - result = PyLong_FromSsize_t(iresult); - - assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring))); - if (kind2 != kind1) - PyMem_Free((void *)buf2); + result = unicode_count_impl(self, substring, start, end); + if (result == -1) + return NULL; - return result; + return PyLong_FromSsize_t(result); } /*[clinic input] @@ -11709,9 +11067,6 @@ unicode_expandtabs_impl(PyObject *self, int tabsize) int kind; int found; - if (PyUnicode_READY(self) == -1) - return NULL; - /* First pass: determine size of output string */ src_len = PyUnicode_GET_LENGTH(self); i = j = line_pos = 0; @@ -11797,9 +11152,6 @@ unicode_find(PyObject *self, PyObject *args) if (!parse_args_finds_unicode("find", args, &substring, &start, &end)) return NULL; - if (PyUnicode_READY(self) == -1) - return NULL; - result = any_find_slice(self, substring, start, end, 1); if (result == -2) @@ -11812,16 +11164,13 @@ static PyObject * unicode_getitem(PyObject *self, Py_ssize_t index) { const void *data; - enum PyUnicode_Kind kind; + int kind; Py_UCS4 ch; if (!PyUnicode_Check(self)) { PyErr_BadArgument(); return NULL; } - if (PyUnicode_READY(self) == -1) { - return NULL; - } if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { PyErr_SetString(PyExc_IndexError, "string index out of range"); return NULL; @@ -11844,8 +11193,6 @@ unicode_hash(PyObject *self) #endif if (_PyUnicode_HASH(self) != -1) return _PyUnicode_HASH(self); - if (PyUnicode_READY(self) == -1) - return -1; x = _Py_HashBytes(PyUnicode_DATA(self), PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); @@ -11874,9 +11221,6 @@ unicode_index(PyObject *self, PyObject *args) if (!parse_args_finds_unicode("index", args, &substring, &start, &end)) return NULL; - if (PyUnicode_READY(self) == -1) - return NULL; - result = any_find_slice(self, substring, start, end, 1); if (result == -2) @@ -11903,9 +11247,6 @@ static PyObject * unicode_isascii_impl(PyObject *self) /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/ { - if (PyUnicode_READY(self) == -1) { - return NULL; - } return PyBool_FromLong(PyUnicode_IS_ASCII(self)); } @@ -11927,8 +11268,6 @@ unicode_islower_impl(PyObject *self) const void *data; int cased; - if (PyUnicode_READY(self) == -1) - return NULL; length = PyUnicode_GET_LENGTH(self); kind = PyUnicode_KIND(self); data = PyUnicode_DATA(self); @@ -11972,8 +11311,6 @@ unicode_isupper_impl(PyObject *self) const void *data; int cased; - if (PyUnicode_READY(self) == -1) - return NULL; length = PyUnicode_GET_LENGTH(self); kind = PyUnicode_KIND(self); data = PyUnicode_DATA(self); @@ -12017,8 +11354,6 @@ unicode_istitle_impl(PyObject *self) const void *data; int cased, previous_is_cased; - if (PyUnicode_READY(self) == -1) - return NULL; length = PyUnicode_GET_LENGTH(self); kind = PyUnicode_KIND(self); data = PyUnicode_DATA(self); @@ -12074,8 +11409,6 @@ unicode_isspace_impl(PyObject *self) int kind; const void *data; - if (PyUnicode_READY(self) == -1) - return NULL; length = PyUnicode_GET_LENGTH(self); kind = PyUnicode_KIND(self); data = PyUnicode_DATA(self); @@ -12114,8 +11447,6 @@ unicode_isalpha_impl(PyObject *self) int kind; const void *data; - if (PyUnicode_READY(self) == -1) - return NULL; length = PyUnicode_GET_LENGTH(self); kind = PyUnicode_KIND(self); data = PyUnicode_DATA(self); @@ -12153,9 +11484,6 @@ unicode_isalnum_impl(PyObject *self) const void *data; Py_ssize_t len, i; - if (PyUnicode_READY(self) == -1) - return NULL; - kind = PyUnicode_KIND(self); data = PyUnicode_DATA(self); len = PyUnicode_GET_LENGTH(self); @@ -12195,8 +11523,6 @@ unicode_isdecimal_impl(PyObject *self) int kind; const void *data; - if (PyUnicode_READY(self) == -1) - return NULL; length = PyUnicode_GET_LENGTH(self); kind = PyUnicode_KIND(self); data = PyUnicode_DATA(self); @@ -12234,8 +11560,6 @@ unicode_isdigit_impl(PyObject *self) int kind; const void *data; - if (PyUnicode_READY(self) == -1) - return NULL; length = PyUnicode_GET_LENGTH(self); kind = PyUnicode_KIND(self); data = PyUnicode_DATA(self); @@ -12274,8 +11598,6 @@ unicode_isnumeric_impl(PyObject *self) int kind; const void *data; - if (PyUnicode_READY(self) == -1) - return NULL; length = PyUnicode_GET_LENGTH(self); kind = PyUnicode_KIND(self); data = PyUnicode_DATA(self); @@ -12300,9 +11622,6 @@ Py_ssize_t _PyUnicode_ScanIdentifier(PyObject *self) { Py_ssize_t i; - if (PyUnicode_READY(self) == -1) - return -1; - Py_ssize_t len = PyUnicode_GET_LENGTH(self); if (len == 0) { /* an empty string is not a valid identifier */ @@ -12336,54 +11655,10 @@ _PyUnicode_ScanIdentifier(PyObject *self) int PyUnicode_IsIdentifier(PyObject *self) { - if (PyUnicode_IS_READY(self)) { - Py_ssize_t i = _PyUnicode_ScanIdentifier(self); - Py_ssize_t len = PyUnicode_GET_LENGTH(self); - /* an empty string is not a valid identifier */ - return len && i == len; - } - else { -_Py_COMP_DIAG_PUSH -_Py_COMP_DIAG_IGNORE_DEPR_DECLS - Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self); - if (len == 0) { - /* an empty string is not a valid identifier */ - return 0; - } - - const wchar_t *wstr = _PyUnicode_WSTR(self); - Py_UCS4 ch = wstr[i++]; -#if SIZEOF_WCHAR_T == 2 - if (Py_UNICODE_IS_HIGH_SURROGATE(ch) - && i < len - && Py_UNICODE_IS_LOW_SURROGATE(wstr[i])) - { - ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]); - i++; - } -#endif - if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) { - return 0; - } - - while (i < len) { - ch = wstr[i++]; -#if SIZEOF_WCHAR_T == 2 - if (Py_UNICODE_IS_HIGH_SURROGATE(ch) - && i < len - && Py_UNICODE_IS_LOW_SURROGATE(wstr[i])) - { - ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]); - i++; - } -#endif - if (!_PyUnicode_IsXidContinue(ch)) { - return 0; - } - } - return 1; -_Py_COMP_DIAG_POP - } + Py_ssize_t i = _PyUnicode_ScanIdentifier(self); + Py_ssize_t len = PyUnicode_GET_LENGTH(self); + /* an empty string is not a valid identifier */ + return len && i == len; } /*[clinic input] @@ -12419,8 +11694,6 @@ unicode_isprintable_impl(PyObject *self) int kind; const void *data; - if (PyUnicode_READY(self) == -1) - return NULL; length = PyUnicode_GET_LENGTH(self); kind = PyUnicode_KIND(self); data = PyUnicode_DATA(self); @@ -12462,8 +11735,6 @@ unicode_join(PyObject *self, PyObject *iterable) static Py_ssize_t unicode_length(PyObject *self) { - if (PyUnicode_READY(self) == -1) - return -1; return PyUnicode_GET_LENGTH(self); } @@ -12483,9 +11754,6 @@ static PyObject * unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/ { - if (PyUnicode_READY(self) == -1) - return NULL; - if (PyUnicode_GET_LENGTH(self) >= width) return unicode_result_unchanged(self); @@ -12502,8 +11770,6 @@ static PyObject * unicode_lower_impl(PyObject *self) /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/ { - if (PyUnicode_READY(self) == -1) - return NULL; if (PyUnicode_IS_ASCII(self)) return ascii_upper_or_lower(self, 1); return case_operation(self, do_lower); @@ -12528,9 +11794,6 @@ _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) BLOOM_MASK sepmask; Py_ssize_t seplen; - if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) - return NULL; - kind = PyUnicode_KIND(self); data = PyUnicode_DATA(self); len = PyUnicode_GET_LENGTH(self); @@ -12576,9 +11839,6 @@ PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) int kind; Py_ssize_t length; - if (PyUnicode_READY(self) == -1) - return NULL; - length = PyUnicode_GET_LENGTH(self); end = Py_MIN(end, length); @@ -12611,9 +11871,6 @@ do_strip(PyObject *self, int striptype) { Py_ssize_t len, i, j; - if (PyUnicode_READY(self) == -1) - return NULL; - len = PyUnicode_GET_LENGTH(self); if (PyUnicode_IS_ASCII(self)) { @@ -12760,9 +12017,6 @@ unicode_repeat(PyObject *str, Py_ssize_t len) if (len == 1) return unicode_result_unchanged(str); - if (PyUnicode_READY(str) == -1) - return NULL; - if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { PyErr_SetString(PyExc_OverflowError, "repeated string is too long"); @@ -12837,8 +12091,6 @@ unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new, Py_ssize_t count) /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/ { - if (PyUnicode_READY(self) == -1) - return NULL; return replace(self, old, new, count); } @@ -12908,9 +12160,6 @@ unicode_repr(PyObject *unicode) const void *idata; void *odata; - if (PyUnicode_READY(unicode) == -1) - return NULL; - isize = PyUnicode_GET_LENGTH(unicode); idata = PyUnicode_DATA(unicode); @@ -13083,9 +12332,6 @@ unicode_rfind(PyObject *self, PyObject *args) if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end)) return NULL; - if (PyUnicode_READY(self) == -1) - return NULL; - result = any_find_slice(self, substring, start, end, -1); if (result == -2) @@ -13115,9 +12361,6 @@ unicode_rindex(PyObject *self, PyObject *args) if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end)) return NULL; - if (PyUnicode_READY(self) == -1) - return NULL; - result = any_find_slice(self, substring, start, end, -1); if (result == -2) @@ -13147,9 +12390,6 @@ static PyObject * unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/ { - if (PyUnicode_READY(self) == -1) - return NULL; - if (PyUnicode_GET_LENGTH(self) >= width) return unicode_result_unchanged(self); @@ -13384,7 +12624,7 @@ unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit) /*[clinic input] str.splitlines as unicode_splitlines - keepends: bool(accept={int}) = False + keepends: bool = False Return a list of the lines in the string, breaking at line boundaries. @@ -13394,7 +12634,7 @@ true. static PyObject * unicode_splitlines_impl(PyObject *self, int keepends) -/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/ +/*[clinic end generated code: output=f664dcdad153ec40 input=ba6ad05ee85d2b55]*/ { return PyUnicode_Splitlines(self, keepends); } @@ -13415,8 +12655,6 @@ static PyObject * unicode_swapcase_impl(PyObject *self) /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/ { - if (PyUnicode_READY(self) == -1) - return NULL; return case_operation(self, do_swapcase); } @@ -13582,8 +12820,6 @@ static PyObject * unicode_upper_impl(PyObject *self) /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/ { - if (PyUnicode_READY(self) == -1) - return NULL; if (PyUnicode_IS_ASCII(self)) return ascii_upper_or_lower(self, 0); return case_operation(self, do_upper); @@ -13610,9 +12846,6 @@ unicode_zfill_impl(PyObject *self, Py_ssize_t width) const void *data; Py_UCS4 chr; - if (PyUnicode_READY(self) == -1) - return NULL; - if (PyUnicode_GET_LENGTH(self) >= width) return unicode_result_unchanged(self); @@ -13655,7 +12888,7 @@ unicode_startswith(PyObject *self, Py_ssize_t end = PY_SSIZE_T_MAX; int result; - if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) + if (!asciilib_parse_args_finds("startswith", args, &subobj, &start, &end)) return NULL; if (PyTuple_Check(subobj)) { Py_ssize_t i; @@ -13709,7 +12942,7 @@ unicode_endswith(PyObject *self, Py_ssize_t end = PY_SSIZE_T_MAX; int result; - if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) + if (!asciilib_parse_args_finds("endswith", args, &subobj, &start, &end)) return NULL; if (PyTuple_Check(subobj)) { Py_ssize_t i; @@ -13756,7 +12989,7 @@ _PyUnicodeWriter_Update(_PyUnicodeWriter *writer) else { /* use a value smaller than PyUnicode_1BYTE_KIND() so _PyUnicodeWriter_PrepareKind() will copy the buffer. */ - writer->kind = PyUnicode_WCHAR_KIND; + writer->kind = 0; assert(writer->kind <= PyUnicode_1BYTE_KIND); /* Copy-on-write mode: set buffer size to 0 so @@ -13776,7 +13009,7 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) /* use a value smaller than PyUnicode_1BYTE_KIND() so _PyUnicodeWriter_PrepareKind() will copy the buffer. */ - writer->kind = PyUnicode_WCHAR_KIND; + writer->kind = 0; assert(writer->kind <= PyUnicode_1BYTE_KIND); } @@ -13869,7 +13102,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, int _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, - enum PyUnicode_Kind kind) + int kind) { Py_UCS4 maxchar; @@ -13911,8 +13144,6 @@ _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) Py_UCS4 maxchar; Py_ssize_t len; - if (PyUnicode_READY(str) == -1) - return -1; len = PyUnicode_GET_LENGTH(str); if (len == 0) return 0; @@ -13921,8 +13152,7 @@ _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) if (writer->buffer == NULL && !writer->overallocate) { assert(_PyUnicode_CheckConsistency(str, 1)); writer->readonly = 1; - Py_INCREF(str); - writer->buffer = str; + writer->buffer = Py_NewRef(str); _PyUnicodeWriter_Update(writer); writer->pos += len; return 0; @@ -13943,9 +13173,6 @@ _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, Py_UCS4 maxchar; Py_ssize_t len; - if (PyUnicode_READY(str) == -1) - return -1; - assert(0 <= start); assert(end <= PyUnicode_GET_LENGTH(str)); assert(start <= end); @@ -14074,7 +13301,7 @@ _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) } assert(_PyUnicode_CheckConsistency(str, 1)); - return unicode_result_ready(str); + return unicode_result(str); } void @@ -14113,8 +13340,6 @@ unicode___format___impl(PyObject *self, PyObject *format_spec) _PyUnicodeWriter writer; int ret; - if (PyUnicode_READY(self) == -1) - return NULL; _PyUnicodeWriter_Init(&writer); ret = _PyUnicode_FormatAdvancedWriter(&writer, self, format_spec, 0, @@ -14140,11 +13365,13 @@ unicode_sizeof_impl(PyObject *self) /* If it's a compact object, account for base structure + character data. */ - if (PyUnicode_IS_COMPACT_ASCII(self)) + if (PyUnicode_IS_COMPACT_ASCII(self)) { size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1; - else if (PyUnicode_IS_COMPACT(self)) + } + else if (PyUnicode_IS_COMPACT(self)) { size = sizeof(PyCompactUnicodeObject) + (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self); + } else { /* If it is a two-block object, account for base object, and for character block if present. */ @@ -14153,10 +13380,6 @@ unicode_sizeof_impl(PyObject *self) size += (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self); } - /* If the wstr pointer is present, account for it unless it is shared - with the data pointer. Check if the data is not shared. */ - if (_PyUnicode_HAS_WSTR_MEMORY(self)) - size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t); if (_PyUnicode_HAS_UTF8_MEMORY(self)) size += PyUnicode_UTF8_LENGTH(self) + 1; @@ -14255,9 +13478,6 @@ static PySequenceMethods unicode_as_sequence = { static PyObject* unicode_subscript(PyObject* self, PyObject* item) { - if (PyUnicode_READY(self) == -1) - return NULL; - if (_PyIndex_Check(item)) { Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); if (i == -1 && PyErr_Occurred()) @@ -14340,7 +13560,7 @@ struct unicode_formatter_t { Py_ssize_t arglen, argidx; PyObject *dict; - enum PyUnicode_Kind fmtkind; + int fmtkind; Py_ssize_t fmtcnt, fmtpos; const void *fmtdata; PyObject *fmtstr; @@ -14479,7 +13699,6 @@ _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type) return NULL; assert(unicode_modifiable(result)); - assert(PyUnicode_IS_READY(result)); assert(PyUnicode_IS_ASCII(result)); /* To modify the string in-place, there can only be one reference. */ @@ -14534,8 +13753,7 @@ _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type) for (i = 0; i < numdigits; i++) *b1++ = *buf++; *b1 = '\0'; - Py_DECREF(result); - result = r1; + Py_SETREF(result, r1); buf = PyBytes_AS_STRING(result); len = numnondigits + prec; } @@ -14552,8 +13770,7 @@ _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type) || buf != PyUnicode_DATA(result)) { PyObject *unicode; unicode = _PyUnicode_FromASCII(buf, len); - Py_DECREF(result); - result = unicode; + Py_SETREF(result, unicode); } else if (len != PyUnicode_GET_LENGTH(result)) { if (PyUnicode_Resize(&result, len) < 0) @@ -14594,8 +13811,7 @@ mainformatlong(PyObject *v, assert(PyLong_Check(iobj)); } else { - iobj = v; - Py_INCREF(iobj); + iobj = Py_NewRef(v); } if (PyLong_CheckExact(v) @@ -14918,8 +14134,7 @@ unicode_format_arg_format(struct unicode_formatter_t *ctx, } if (PyUnicode_CheckExact(v) && arg->ch == 's') { - *p_str = v; - Py_INCREF(*p_str); + *p_str = Py_NewRef(v); } else { if (arg->ch == 's') @@ -15001,7 +14216,7 @@ unicode_format_arg_output(struct unicode_formatter_t *ctx, PyObject *str) { Py_ssize_t len; - enum PyUnicode_Kind kind; + int kind; const void *pbuf; Py_ssize_t pindex; Py_UCS4 signchar; @@ -15015,9 +14230,6 @@ unicode_format_arg_output(struct unicode_formatter_t *ctx, if (arg->sign && arg->flags & F_ZERO) fill = '0'; - if (PyUnicode_READY(str) == -1) - return -1; - len = PyUnicode_GET_LENGTH(str); if ((arg->width == -1 || arg->width <= len) && (arg->prec == -1 || arg->prec >= len) @@ -15319,15 +14531,12 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode) { PyObject *self; Py_ssize_t length, char_size; - int share_wstr, share_utf8; - unsigned int kind; + int share_utf8; + int kind; void *data; assert(PyType_IsSubtype(type, &PyUnicode_Type)); assert(_PyUnicode_CHECK(unicode)); - if (PyUnicode_READY(unicode) == -1) { - return NULL; - } self = type->tp_alloc(type, 0); if (self == NULL) { @@ -15346,15 +14555,12 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode) _PyUnicode_STATE(self).kind = kind; _PyUnicode_STATE(self).compact = 0; _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; - _PyUnicode_STATE(self).ready = 1; - _PyUnicode_WSTR(self) = NULL; + _PyUnicode_STATE(self).statically_allocated = 0; _PyUnicode_UTF8_LENGTH(self) = 0; _PyUnicode_UTF8(self) = NULL; - _PyUnicode_WSTR_LENGTH(self) = 0; _PyUnicode_DATA_ANY(self) = NULL; share_utf8 = 0; - share_wstr = 0; if (kind == PyUnicode_1BYTE_KIND) { char_size = 1; if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) @@ -15362,14 +14568,10 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode) } else if (kind == PyUnicode_2BYTE_KIND) { char_size = 2; - if (sizeof(wchar_t) == 2) - share_wstr = 1; } else { assert(kind == PyUnicode_4BYTE_KIND); char_size = 4; - if (sizeof(wchar_t) == 4) - share_wstr = 1; } /* Ensure we won't overflow the length. */ @@ -15388,13 +14590,8 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode) _PyUnicode_UTF8_LENGTH(self) = length; _PyUnicode_UTF8(self) = data; } - if (share_wstr) { - _PyUnicode_WSTR_LENGTH(self) = length; - _PyUnicode_WSTR(self) = (wchar_t *)data; - } - memcpy(data, PyUnicode_DATA(unicode), - kind * (length + 1)); + memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1)); assert(_PyUnicode_CheckConsistency(self, 1)); #ifdef Py_DEBUG _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); @@ -15474,12 +14671,14 @@ PyTypeObject PyUnicode_Type = { /* Initialize the Unicode implementation */ -void -_PyUnicode_InitState(PyInterpreterState *interp) +static void +_init_global_state(void) { - if (!_Py_IsMainInterpreter(interp)) { + static int initialized = 0; + if (initialized) { return; } + initialized = 1; /* initialize the linebreak bloom filter */ const Py_UCS2 linebreak[] = { @@ -15497,21 +14696,42 @@ _PyUnicode_InitState(PyInterpreterState *interp) Py_ARRAY_LENGTH(linebreak)); } +void +_PyUnicode_InitState(PyInterpreterState *interp) +{ + if (!_Py_IsMainInterpreter(interp)) { + return; + } + _init_global_state(); +} + PyStatus _PyUnicode_InitGlobalObjects(PyInterpreterState *interp) { - if (!_Py_IsMainInterpreter(interp)) { - return _PyStatus_OK(); + // Initialize the global interned dict + if (init_interned_dict(interp)) { + PyErr_Clear(); + return _PyStatus_ERR("failed to create interned dict"); } + if (_Py_IsMainInterpreter(interp)) { + /* Intern statically allocated string identifiers and deepfreeze strings. + * This must be done before any module initialization so that statically + * allocated string identifiers are used instead of heap allocated strings. + * Deepfreeze uses the interned identifiers if present to save space + * else generates them and they are interned to speed up dict lookups. + */ + _PyUnicode_InitStaticStrings(interp); + #ifdef Py_DEBUG - assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1)); + assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1)); - for (int i = 0; i < 256; i++) { - assert(_PyUnicode_CheckConsistency(LATIN1(i), 1)); - } + for (int i = 0; i < 256; i++) { + assert(_PyUnicode_CheckConsistency(LATIN1(i), 1)); + } #endif + } return _PyStatus_OK(); } @@ -15520,17 +14740,13 @@ _PyUnicode_InitGlobalObjects(PyInterpreterState *interp) PyStatus _PyUnicode_InitTypes(PyInterpreterState *interp) { - if (!_Py_IsMainInterpreter(interp)) { - return _PyStatus_OK(); - } - - if (PyType_Ready(&EncodingMapType) < 0) { + if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) { goto error; } - if (PyType_Ready(&PyFieldNameIter_Type) < 0) { + if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) { goto error; } - if (PyType_Ready(&PyFormatterIter_Type) < 0) { + if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) { goto error; } return _PyStatus_OK(); @@ -15541,7 +14757,7 @@ error: void -PyUnicode_InternInPlace(PyObject **p) +_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p) { PyObject *s = *p; #ifdef Py_DEBUG @@ -15563,19 +14779,26 @@ PyUnicode_InternInPlace(PyObject **p) return; } - if (PyUnicode_READY(s) == -1) { - PyErr_Clear(); + /* Look in the global cache first. */ + PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s); + if (r != NULL && r != s) { + Py_SETREF(*p, Py_NewRef(r)); return; } - if (interned == NULL) { - interned = PyDict_New(); - if (interned == NULL) { - PyErr_Clear(); /* Don't leave an exception */ - return; + /* Handle statically allocated strings. */ + if (_PyUnicode_STATE(s).statically_allocated) { + assert(_Py_IsImmortal(s)); + if (_Py_hashtable_set(INTERNED_STRINGS, s, s) == 0) { + _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC; } + return; } + /* Look in the per-interpreter cache. */ + PyObject *interned = get_interned_dict(interp); + assert(interned != NULL); + PyObject *t = PyDict_SetDefault(interned, s, s); if (t == NULL) { PyErr_Clear(); @@ -15583,35 +14806,44 @@ PyUnicode_InternInPlace(PyObject **p) } if (t != s) { - Py_INCREF(t); - Py_SETREF(*p, t); + Py_SETREF(*p, Py_NewRef(t)); + return; + } + + if (_Py_IsImmortal(s)) { + // XXX Restrict this to the main interpreter? + _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC; return; } - /* The two references in interned dict (key and value) are not counted by - refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of - this. */ - Py_SET_REFCNT(s, Py_REFCNT(s) - 2); - _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; +#ifdef Py_REF_DEBUG + /* The reference count value excluding the 2 references from the + interned dictionary should be excluded from the RefTotal. The + decrements to these objects will not be registered so they + need to be accounted for in here. */ + for (Py_ssize_t i = 0; i < Py_REFCNT(s) - 2; i++) { + _Py_DecRefTotal(_PyInterpreterState_GET()); + } +#endif + _Py_SetImmortal(s); + _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; } void -PyUnicode_InternImmortal(PyObject **p) +PyUnicode_InternInPlace(PyObject **p) { - if (PyErr_WarnEx(PyExc_DeprecationWarning, - "PyUnicode_InternImmortal() is deprecated; " - "use PyUnicode_InternInPlace() instead", 1) < 0) - { - // The function has no return value, the exception cannot - // be reported to the caller, so just log it. - PyErr_WriteUnraisable(NULL); - } + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyUnicode_InternInPlace(interp, p); +} +// Function kept for the stable ABI. +PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); +void +PyUnicode_InternImmortal(PyObject **p) +{ PyUnicode_InternInPlace(p); - if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { - _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; - Py_INCREF(*p); - } + // Leak a reference on purpose + Py_INCREF(*p); } PyObject * @@ -15628,61 +14860,81 @@ PyUnicode_InternFromString(const char *cp) void _PyUnicode_ClearInterned(PyInterpreterState *interp) { - if (!_Py_IsMainInterpreter(interp)) { - // interned dict is shared by all interpreters - return; - } - + PyObject *interned = get_interned_dict(interp); if (interned == NULL) { return; } assert(PyDict_CheckExact(interned)); - /* Interned unicode strings are not forcibly deallocated; rather, we give - them their stolen references back, and then clear and DECREF the - interned dict. */ - + /* TODO: + * Currently, the runtime is not able to guarantee that it can exit without + * allocations that carry over to a future initialization of Python within + * the same process. i.e: + * ./python -X showrefcount -c 'import itertools' + * [237 refs, 237 blocks] + * + * Therefore, this should remain disabled for until there is a strict guarantee + * that no memory will be left after `Py_Finalize`. + */ +#ifdef Py_DEBUG + /* For all non-singleton interned strings, restore the two valid references + to that instance from within the intern string dictionary and let the + normal reference counting process clean up these instances. */ #ifdef INTERNED_STATS fprintf(stderr, "releasing %zd interned strings\n", PyDict_GET_SIZE(interned)); - Py_ssize_t immortal_size = 0, mortal_size = 0; + Py_ssize_t total_length = 0; #endif Py_ssize_t pos = 0; PyObject *s, *ignored_value; while (PyDict_Next(interned, &pos, &s, &ignored_value)) { assert(PyUnicode_IS_READY(s)); - + int shared = 0; switch (PyUnicode_CHECK_INTERNED(s)) { case SSTATE_INTERNED_IMMORTAL: - Py_SET_REFCNT(s, Py_REFCNT(s) + 1); -#ifdef INTERNED_STATS - immortal_size += PyUnicode_GET_LENGTH(s); -#endif - break; - case SSTATE_INTERNED_MORTAL: - // Restore the two references (key and value) ignored + // Skip the Immortal Instance check and restore + // the two references (key and value) ignored // by PyUnicode_InternInPlace(). - Py_SET_REFCNT(s, Py_REFCNT(s) + 2); + s->ob_refcnt = 2; #ifdef INTERNED_STATS - mortal_size += PyUnicode_GET_LENGTH(s); + total_length += PyUnicode_GET_LENGTH(s); #endif break; + case SSTATE_INTERNED_IMMORTAL_STATIC: + /* It is shared between interpreters, so we should unmark it + only when this is the last interpreter in which it's + interned. We immortalize all the statically initialized + strings during startup, so we can rely on the + main interpreter to be the last one. */ + if (!_Py_IsMainInterpreter(interp)) { + shared = 1; + } + break; + case SSTATE_INTERNED_MORTAL: + /* fall through */ case SSTATE_NOT_INTERNED: /* fall through */ default: Py_UNREACHABLE(); } - _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; + if (!shared) { + _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; + } } #ifdef INTERNED_STATS fprintf(stderr, - "total size of all interned strings: %zd/%zd mortal/immortal\n", - mortal_size, immortal_size); + "total length of all interned strings: %zd characters\n", + total_length); #endif - PyDict_Clear(interned); - Py_CLEAR(interned); + struct _Py_unicode_state *state = &interp->unicode; + struct _Py_unicode_ids *ids = &state->ids; + for (Py_ssize_t i=0; i < ids->size; i++) { + Py_XINCREF(ids->array[i]); + } +#endif /* Py_DEBUG */ + clear_interned_dict(interp); } @@ -15779,7 +15031,7 @@ unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored)) if (it->it_seq != NULL) { return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index); } else { - PyObject *u = (PyObject *)_PyUnicode_New(0); + PyObject *u = unicode_new_empty(); if (u == NULL) { Py_XDECREF(iter); return NULL; @@ -15873,8 +15125,6 @@ unicode_iter(PyObject *seq) PyErr_BadInternalCall(); return NULL; } - if (PyUnicode_READY(seq) == -1) - return NULL; if (PyUnicode_IS_COMPACT_ASCII(seq)) { it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type); } @@ -15884,8 +15134,7 @@ unicode_iter(PyObject *seq) if (it == NULL) return NULL; it->it_index = 0; - Py_INCREF(seq); - it->it_seq = seq; + it->it_seq = Py_NewRef(seq); _PyObject_GC_TRACK(it); return (PyObject *)it; } @@ -16012,10 +15261,13 @@ init_fs_codec(PyInterpreterState *interp) /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors global configuration variables. */ - if (_Py_SetFileSystemEncoding(fs_codec->encoding, - fs_codec->errors) < 0) { - PyErr_NoMemory(); - return -1; + if (_Py_IsMainInterpreter(interp)) { + + if (_Py_SetFileSystemEncoding(fs_codec->encoding, + fs_codec->errors) < 0) { + PyErr_NoMemory(); + return -1; + } } return 0; } @@ -16098,7 +15350,7 @@ _PyUnicode_EnableLegacyWindowsFSEncoding(void) static inline int unicode_is_finalizing(void) { - return (interned == NULL); + return (get_interned_dict(_PyInterpreterState_Main()) == NULL); } #endif @@ -16106,42 +15358,9 @@ unicode_is_finalizing(void) void _PyUnicode_FiniTypes(PyInterpreterState *interp) { - if (!_Py_IsMainInterpreter(interp)) { - return; - } - - _PyStaticType_Dealloc(&EncodingMapType); - _PyStaticType_Dealloc(&PyFieldNameIter_Type); - _PyStaticType_Dealloc(&PyFormatterIter_Type); -} - - -static void unicode_static_dealloc(PyObject *op) -{ - PyASCIIObject *ascii = _PyASCIIObject_CAST(op); - - assert(ascii->state.compact); - - if (ascii->state.ascii) { - if (ascii->wstr) { - PyObject_Free(ascii->wstr); - ascii->wstr = NULL; - } - } - else { - PyCompactUnicodeObject* compact = (PyCompactUnicodeObject*)op; - void* data = (void*)(compact + 1); - if (ascii->wstr && ascii->wstr != data) { - PyObject_Free(ascii->wstr); - ascii->wstr = NULL; - compact->wstr_length = 0; - } - if (compact->utf8) { - PyObject_Free(compact->utf8); - compact->utf8 = NULL; - compact->utf8_length = 0; - } - } + _PyStaticType_Dealloc(interp, &EncodingMapType); + _PyStaticType_Dealloc(interp, &PyFieldNameIter_Type); + _PyStaticType_Dealloc(interp, &PyFormatterIter_Type); } @@ -16150,35 +15369,18 @@ _PyUnicode_Fini(PyInterpreterState *interp) { struct _Py_unicode_state *state = &interp->unicode; - if (_Py_IsMainInterpreter(interp)) { - // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini() - assert(interned == NULL); - // bpo-47182: force a unicodedata CAPI capsule re-import on - // subsequent initialization of main interpreter. - ucnhash_capi = NULL; - } + // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini() + assert(get_interned_dict(interp) == NULL); _PyUnicode_FiniEncodings(&state->fs_codec); - unicode_clear_identifiers(state); - - // Clear the single character singletons - for (int i = 0; i < 128; i++) { - unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).ascii[i]); - } - for (int i = 0; i < 128; i++) { - unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).latin1[i]); - } -} - + // bpo-47182: force a unicodedata CAPI capsule re-import on + // subsequent initialization of interpreter. + interp->unicode.ucnhash_capi = NULL; -void -_PyStaticUnicode_Dealloc(PyObject *op) -{ - unicode_static_dealloc(op); + unicode_clear_identifiers(state); } - /* A _string module, to export formatter_parser and formatter_field_name_split to the string.Formatter class implemented in Python. */ @@ -16190,12 +15392,18 @@ static PyMethodDef _string_methods[] = { {NULL, NULL} }; +static PyModuleDef_Slot module_slots[] = { + {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED}, + {0, NULL} +}; + static struct PyModuleDef _string_module = { PyModuleDef_HEAD_INIT, .m_name = "_string", .m_doc = PyDoc_STR("string helper module"), .m_size = 0, .m_methods = _string_methods, + .m_slots = module_slots, }; PyMODINIT_FUNC -- cgit v1.3