diff options
| author | shadchin <[email protected]> | 2022-04-18 12:39:32 +0300 |
|---|---|---|
| committer | shadchin <[email protected]> | 2022-04-18 12:39:32 +0300 |
| commit | d4be68e361f4258cf0848fc70018dfe37a2acc24 (patch) | |
| tree | 153e294cd97ac8b5d7a989612704a0c1f58e8ad4 /contrib/tools/python3/src/Objects/unicodeobject.c | |
| parent | 260c02f5ccf242d9d9b8a873afaf6588c00237d6 (diff) | |
IGNIETFERRO-1816 Update Python 3 from 3.9.12 to 3.10.4
ref:9f96be6d02ee8044fdd6f124b799b270c20ce641
Diffstat (limited to 'contrib/tools/python3/src/Objects/unicodeobject.c')
| -rw-r--r-- | contrib/tools/python3/src/Objects/unicodeobject.c | 1115 |
1 files changed, 578 insertions, 537 deletions
diff --git a/contrib/tools/python3/src/Objects/unicodeobject.c b/contrib/tools/python3/src/Objects/unicodeobject.c index 7767d140e6c..077cf8d7f45 100644 --- a/contrib/tools/python3/src/Objects/unicodeobject.c +++ b/contrib/tools/python3/src/Objects/unicodeobject.c @@ -40,17 +40,18 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #define PY_SSIZE_T_CLEAN #include "Python.h" -#include "pycore_abstract.h" // _PyIndex_Check() -#include "pycore_bytes_methods.h" -#include "pycore_fileutils.h" -#include "pycore_initconfig.h" -#include "pycore_interp.h" // PyInterpreterState.fs_codec -#include "pycore_object.h" -#include "pycore_pathconfig.h" -#include "pycore_pylifecycle.h" -#include "pycore_pystate.h" // _PyInterpreterState_GET() -#include "ucnhash.h" -#include "stringlib/eq.h" +#include "pycore_abstract.h" // _PyIndex_Check() +#include "pycore_atomic_funcs.h" // _Py_atomic_size_get() +#include "pycore_bytes_methods.h" // _Py_bytes_lower() +#include "pycore_format.h" // F_LJUST +#include "pycore_initconfig.h" // _PyStatus_OK() +#include "pycore_interp.h" // PyInterpreterState.fs_codec +#include "pycore_object.h" // _PyObject_GC_TRACK() +#include "pycore_pathconfig.h" // _Py_DumpPathConfig() +#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding() +#include "pycore_pystate.h" // _PyInterpreterState_GET() +#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI +#include "stringlib/eq.h" // unicode_eq() #ifdef MS_WINDOWS #include <windows.h> @@ -60,8 +61,8 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar() #endif -/* Uncomment to display statistics on interned strings at exit when - using Valgrind or Insecure++. */ +/* Uncomment to display statistics on interned strings at exit + in _PyUnicode_ClearInterned(). */ /* #define INTERNED_STATS 1 */ @@ -210,7 +211,10 @@ extern "C" { # define OVERALLOCATE_FACTOR 4 #endif -#define INTERNED_STRINGS +/* bpo-40521: Interned strings are shared by all interpreters. */ +#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS +# define INTERNED_STRINGS +#endif /* This dictionary holds all interned unicode strings. Note that references to strings in this dictionary are *not* counted in the string's ob_refcnt. @@ -224,26 +228,36 @@ extern "C" { static PyObject *interned = NULL; #endif -/* The empty Unicode object is shared to improve performance. */ -static PyObject *unicode_empty = NULL; +static struct _Py_unicode_state* +get_unicode_state(void) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + return &interp->unicode; +} -#define _Py_INCREF_UNICODE_EMPTY() \ - do { \ - if (unicode_empty != NULL) \ - Py_INCREF(unicode_empty); \ - else { \ - unicode_empty = PyUnicode_New(0, 0); \ - if (unicode_empty != NULL) { \ - Py_INCREF(unicode_empty); \ - assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \ - } \ - } \ - } while (0) -#define _Py_RETURN_UNICODE_EMPTY() \ - do { \ - _Py_INCREF_UNICODE_EMPTY(); \ - return unicode_empty; \ +// Return a borrowed reference to the empty string singleton. +static inline PyObject* unicode_get_empty(void) +{ + struct _Py_unicode_state *state = get_unicode_state(); + // unicode_get_empty() must not be called before _PyUnicode_Init() + // or after _PyUnicode_Fini() + assert(state->empty_string != NULL); + return state->empty_string; +} + + +// Return a strong reference to the empty string singleton. +static inline PyObject* unicode_new_empty(void) +{ + PyObject *empty = unicode_get_empty(); + Py_INCREF(empty); + return empty; +} + +#define _Py_RETURN_UNICODE_EMPTY() \ + do { \ + return unicode_new_empty(); \ } while (0) static inline void @@ -294,17 +308,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, _Py_error_handler error_handler, const char *errors, Py_ssize_t *consumed); -/* List of static strings. */ -static _Py_Identifier *static_strings = NULL; - -#define LATIN1_SINGLETONS - -#ifdef LATIN1_SINGLETONS -/* Single character Unicode strings in the Latin-1 range are being - shared as well. */ -static PyObject *unicode_latin1[256] = {NULL}; -#endif - /* Fast detection of the most frequent whitespace characters */ const unsigned char _Py_ascii_whitespace[] = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -498,20 +501,6 @@ unicode_check_encoding_errors(const char *encoding, const char *errors) } -/* The max unicode value is always 0x10FFFF while using the PEP-393 API. - This function is kept for backward compatibility with the old API. */ -Py_UNICODE -PyUnicode_GetMax(void) -{ -#ifdef Py_UNICODE_WIDE - return 0x10FFFF; -#else - /* This is actually an illegal character, so it should - not be passed to unichr. */ - return 0xFFFF; -#endif -} - int _PyUnicode_CheckConsistency(PyObject *op, int check_content) { @@ -648,9 +637,8 @@ unicode_result_wchar(PyObject *unicode) if (len == 1) { wchar_t ch = _PyUnicode_WSTR(unicode)[0]; if ((Py_UCS4)ch < 256) { - PyObject *latin1_char = get_latin1_char((unsigned char)ch); Py_DECREF(unicode); - return latin1_char; + return get_latin1_char((unsigned char)ch); } } @@ -675,20 +663,21 @@ unicode_result_ready(PyObject *unicode) length = PyUnicode_GET_LENGTH(unicode); if (length == 0) { - if (unicode != unicode_empty) { + PyObject *empty = unicode_get_empty(); + if (unicode != empty) { Py_DECREF(unicode); - _Py_RETURN_UNICODE_EMPTY(); + Py_INCREF(empty); } - return unicode_empty; + return empty; } -#ifdef LATIN1_SINGLETONS if (length == 1) { - const void *data = PyUnicode_DATA(unicode); int kind = PyUnicode_KIND(unicode); - Py_UCS4 ch = PyUnicode_READ(kind, data, 0); - if (ch < 256) { - PyObject *latin1_char = unicode_latin1[ch]; + if (kind == PyUnicode_1BYTE_KIND) { + const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); + Py_UCS1 ch = data[0]; + struct _Py_unicode_state *state = get_unicode_state(); + PyObject *latin1_char = state->latin1[ch]; if (latin1_char != NULL) { if (unicode != latin1_char) { Py_INCREF(latin1_char); @@ -699,12 +688,14 @@ unicode_result_ready(PyObject *unicode) else { assert(_PyUnicode_CheckConsistency(unicode, 1)); Py_INCREF(unicode); - unicode_latin1[ch] = unicode; + state->latin1[ch] = unicode; return unicode; } } + else { + assert(PyUnicode_READ_CHAR(unicode, 0) >= 256); + } } -#endif assert(_PyUnicode_CheckConsistency(unicode, 1)); return unicode; @@ -867,7 +858,7 @@ xmlcharrefreplace(_PyBytesWriter *writer, char *str, to keep things simple, we use a single bitmask, using the least 5 bits from each unicode characters as the bit index. */ -/* the linebreak mask is set up by Unicode_Init below */ +/* the linebreak mask is set up by _PyUnicode_Init() below */ #if LONG_BIT >= 128 #define BLOOM_WIDTH 128 @@ -941,6 +932,8 @@ ensure_unicode(PyObject *obj) /* Compilation of templated routines */ +#define STRINGLIB_GET_EMPTY() unicode_get_empty() + #include "stringlib/asciilib.h" #include "stringlib/fastsearch.h" #include "stringlib/partition.h" @@ -989,6 +982,8 @@ _Py_COMP_DIAG_IGNORE_DEPR_DECLS #include "stringlib/undef.h" _Py_COMP_DIAG_POP +#undef STRINGLIB_GET_EMPTY + /* --- Unicode Object ----------------------------------------------------- */ static inline Py_ssize_t @@ -1070,7 +1065,7 @@ resize_compact(PyObject *unicode, Py_ssize_t length) new_size = (struct_size + (length + 1) * char_size); if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) { - PyObject_DEL(_PyUnicode_UTF8(unicode)); + PyObject_Free(_PyUnicode_UTF8(unicode)); _PyUnicode_UTF8(unicode) = NULL; _PyUnicode_UTF8_LENGTH(unicode) = 0; } @@ -1081,7 +1076,7 @@ resize_compact(PyObject *unicode, Py_ssize_t length) _Py_ForgetReference(unicode); #endif - new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size); + new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size); if (new_unicode == NULL) { _Py_NewReference(unicode); PyErr_NoMemory(); @@ -1097,7 +1092,7 @@ resize_compact(PyObject *unicode, Py_ssize_t length) _PyUnicode_WSTR_LENGTH(unicode) = length; } else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { - PyObject_DEL(_PyUnicode_WSTR(unicode)); + PyObject_Free(_PyUnicode_WSTR(unicode)); _PyUnicode_WSTR(unicode) = NULL; if (!PyUnicode_IS_ASCII(unicode)) _PyUnicode_WSTR_LENGTH(unicode) = 0; @@ -1140,12 +1135,12 @@ resize_inplace(PyObject *unicode, Py_ssize_t length) if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) { - PyObject_DEL(_PyUnicode_UTF8(unicode)); + PyObject_Free(_PyUnicode_UTF8(unicode)); _PyUnicode_UTF8(unicode) = NULL; _PyUnicode_UTF8_LENGTH(unicode) = 0; } - data = (PyObject *)PyObject_REALLOC(data, new_size); + data = (PyObject *)PyObject_Realloc(data, new_size); if (data == NULL) { PyErr_NoMemory(); return -1; @@ -1178,7 +1173,7 @@ resize_inplace(PyObject *unicode, Py_ssize_t length) } new_size = sizeof(wchar_t) * (length + 1); wstr = _PyUnicode_WSTR(unicode); - wstr = PyObject_REALLOC(wstr, new_size); + wstr = PyObject_Realloc(wstr, new_size); if (!wstr) { PyErr_NoMemory(); return -1; @@ -1237,9 +1232,8 @@ _PyUnicode_New(Py_ssize_t length) size_t new_size; /* Optimization for empty strings */ - if (length == 0 && unicode_empty != NULL) { - Py_INCREF(unicode_empty); - return (PyUnicodeObject*)unicode_empty; + if (length == 0) { + return (PyUnicodeObject *)unicode_new_empty(); } /* Ensure we won't overflow the size. */ @@ -1269,7 +1263,7 @@ _PyUnicode_New(Py_ssize_t length) _PyUnicode_UTF8(unicode) = NULL; _PyUnicode_UTF8_LENGTH(unicode) = 0; - _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); + _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size); if (!_PyUnicode_WSTR(unicode)) { Py_DECREF(unicode); PyErr_NoMemory(); @@ -1369,27 +1363,51 @@ _PyUnicode_Dump(PyObject *op) } else data = unicode->data.any; - printf("%s: len=%" PY_FORMAT_SIZE_T "u, ", - unicode_kind_name(op), ascii->length); + printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length); if (ascii->wstr == data) printf("shared "); printf("wstr=%p", (void *)ascii->wstr); if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { - printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length); - if (!ascii->state.compact && compact->utf8 == unicode->data.any) + printf(" (%zu), ", compact->wstr_length); + if (!ascii->state.compact && compact->utf8 == unicode->data.any) { printf("shared "); - printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)", - (void *)compact->utf8, compact->utf8_length); + } + printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length); } printf(", data=%p\n", data); } #endif +static int +unicode_create_empty_string_singleton(struct _Py_unicode_state *state) +{ + // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be + // optimized to always use state->empty_string without having to check if + // it is NULL or not. + PyObject *empty = PyUnicode_New(1, 0); + if (empty == NULL) { + return -1; + } + PyUnicode_1BYTE_DATA(empty)[0] = 0; + _PyUnicode_LENGTH(empty) = 0; + assert(_PyUnicode_CheckConsistency(empty, 1)); + + assert(state->empty_string == NULL); + state->empty_string = empty; + return 0; +} + + PyObject * PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) { + /* Optimization for empty strings */ + if (size == 0) { + return unicode_new_empty(); + } + PyObject *obj; PyCompactUnicodeObject *unicode; void *data; @@ -1398,12 +1416,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) Py_ssize_t char_size; Py_ssize_t struct_size; - /* Optimization for empty strings */ - if (size == 0 && unicode_empty != NULL) { - Py_INCREF(unicode_empty); - return unicode_empty; - } - is_ascii = 0; is_sharing = 0; struct_size = sizeof(PyCompactUnicodeObject); @@ -1448,12 +1460,11 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) * PyObject_New() so we are able to allocate space for the object and * it's data buffer. */ - obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); - if (obj == NULL) + obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size); + if (obj == NULL) { return PyErr_NoMemory(); - obj = PyObject_INIT(obj, &PyUnicode_Type); - if (obj == NULL) - return NULL; + } + _PyObject_Init(obj, &PyUnicode_Type); unicode = (PyCompactUnicodeObject *)obj; if (is_ascii) @@ -1831,7 +1842,7 @@ _PyUnicode_Ready(PyObject *unicode) return -1; if (maxchar < 256) { - _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); + _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1); if (!_PyUnicode_DATA_ANY(unicode)) { PyErr_NoMemory(); return -1; @@ -1852,7 +1863,7 @@ _PyUnicode_Ready(PyObject *unicode) _PyUnicode_UTF8(unicode) = NULL; _PyUnicode_UTF8_LENGTH(unicode) = 0; } - PyObject_FREE(_PyUnicode_WSTR(unicode)); + PyObject_Free(_PyUnicode_WSTR(unicode)); _PyUnicode_WSTR(unicode) = NULL; _PyUnicode_WSTR_LENGTH(unicode) = 0; } @@ -1872,7 +1883,7 @@ _PyUnicode_Ready(PyObject *unicode) _PyUnicode_UTF8_LENGTH(unicode) = 0; #else /* sizeof(wchar_t) == 4 */ - _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( + _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc( 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); if (!_PyUnicode_DATA_ANY(unicode)) { PyErr_NoMemory(); @@ -1886,7 +1897,7 @@ _PyUnicode_Ready(PyObject *unicode) _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; _PyUnicode_UTF8(unicode) = NULL; _PyUnicode_UTF8_LENGTH(unicode) = 0; - PyObject_FREE(_PyUnicode_WSTR(unicode)); + PyObject_Free(_PyUnicode_WSTR(unicode)); _PyUnicode_WSTR(unicode) = NULL; _PyUnicode_WSTR_LENGTH(unicode) = 0; #endif @@ -1901,7 +1912,7 @@ _PyUnicode_Ready(PyObject *unicode) PyErr_NoMemory(); return -1; } - _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); + _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1)); if (!_PyUnicode_DATA_ANY(unicode)) { PyErr_NoMemory(); return -1; @@ -1913,7 +1924,7 @@ _PyUnicode_Ready(PyObject *unicode) /* unicode_convert_wchar_to_ucs4() requires a ready string */ _PyUnicode_STATE(unicode).ready = 1; unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); - PyObject_FREE(_PyUnicode_WSTR(unicode)); + PyObject_Free(_PyUnicode_WSTR(unicode)); _PyUnicode_WSTR(unicode) = NULL; _PyUnicode_WSTR_LENGTH(unicode) = 0; #else @@ -1940,15 +1951,24 @@ unicode_dealloc(PyObject *unicode) break; case SSTATE_INTERNED_MORTAL: - /* revive dead object temporarily for DelItem */ - Py_SET_REFCNT(unicode, 3); + { #ifdef INTERNED_STRINGS + /* Revive the dead object temporarily. PyDict_DelItem() removes two + references (key and value) which were ignored by + PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2 + to prevent calling unicode_dealloc() again. Adjust refcnt after + PyDict_DelItem(). */ + assert(Py_REFCNT(unicode) == 0); + Py_SET_REFCNT(unicode, 3); if (PyDict_DelItem(interned, unicode) != 0) { _PyErr_WriteUnraisableMsg("deletion of interned string failed", NULL); } + assert(Py_REFCNT(unicode) == 1); + Py_SET_REFCNT(unicode, 0); #endif break; + } case SSTATE_INTERNED_IMMORTAL: _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died"); @@ -1959,13 +1979,13 @@ unicode_dealloc(PyObject *unicode) } if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { - PyObject_DEL(_PyUnicode_WSTR(unicode)); + PyObject_Free(_PyUnicode_WSTR(unicode)); } if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) { - PyObject_DEL(_PyUnicode_UTF8(unicode)); + PyObject_Free(_PyUnicode_UTF8(unicode)); } if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) { - PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); + PyObject_Free(_PyUnicode_DATA_ANY(unicode)); } Py_TYPE(unicode)->tp_free(unicode); @@ -1975,18 +1995,18 @@ unicode_dealloc(PyObject *unicode) static int unicode_is_singleton(PyObject *unicode) { - if (unicode == unicode_empty) { + struct _Py_unicode_state *state = get_unicode_state(); + if (unicode == state->empty_string) { return 1; } -#ifdef LATIN1_SINGLETONS PyASCIIObject *ascii = (PyASCIIObject *)unicode; if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) { Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); - if (ch < 256 && unicode_latin1[ch] == unicode) + if (ch < 256 && state->latin1[ch] == unicode) { return 1; + } } -#endif return 0; } #endif @@ -2031,10 +2051,8 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length) return 0; if (length == 0) { - _Py_INCREF_UNICODE_EMPTY(); - if (!unicode_empty) - return -1; - Py_SETREF(*p_unicode, unicode_empty); + PyObject *empty = unicode_new_empty(); + Py_SETREF(*p_unicode, empty); return 0; } @@ -2126,17 +2144,15 @@ unicode_write_cstr(PyObject *unicode, Py_ssize_t index, } static PyObject* -get_latin1_char(unsigned char ch) +get_latin1_char(Py_UCS1 ch) { - PyObject *unicode; + struct _Py_unicode_state *state = get_unicode_state(); -#ifdef LATIN1_SINGLETONS - unicode = unicode_latin1[ch]; + PyObject *unicode = state->latin1[ch]; if (unicode) { Py_INCREF(unicode); return unicode; } -#endif unicode = PyUnicode_New(1, ch); if (!unicode) { @@ -2146,10 +2162,8 @@ get_latin1_char(unsigned char ch) PyUnicode_1BYTE_DATA(unicode)[0] = ch; assert(_PyUnicode_CheckConsistency(unicode, 1)); -#ifdef LATIN1_SINGLETONS Py_INCREF(unicode); - unicode_latin1[ch] = unicode; -#endif + state->latin1[ch] = unicode; return unicode; } @@ -2160,8 +2174,9 @@ unicode_char(Py_UCS4 ch) assert(ch <= MAX_UNICODE); - if (ch < 256) + if (ch < 256) { return get_latin1_char(ch); + } unicode = PyUnicode_New(1, ch); if (unicode == NULL) @@ -2181,8 +2196,16 @@ unicode_char(Py_UCS4 ch) PyObject * PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) { - if (u == NULL) + if (u == NULL) { + if (size > 0) { + if (PyErr_WarnEx(PyExc_DeprecationWarning, + "PyUnicode_FromUnicode(NULL, size) is deprecated; " + "use PyUnicode_New() instead", 1) < 0) { + return NULL; + } + } return (PyObject*)_PyUnicode_New(size); + } if (size < 0) { PyErr_BadInternalCall(); @@ -2282,10 +2305,19 @@ PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) "Negative size passed to PyUnicode_FromStringAndSize"); return NULL; } - if (u != NULL) + if (u != NULL) { return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); - else + } + else { + if (size > 0) { + if (PyErr_WarnEx(PyExc_DeprecationWarning, + "PyUnicode_FromStringAndSize(NULL, size) is deprecated; " + "use PyUnicode_New() instead", 1) < 0) { + return NULL; + } + } return (PyObject *)_PyUnicode_New(size); + } } PyObject * @@ -2299,36 +2331,84 @@ PyUnicode_FromString(const char *u) return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); } + PyObject * _PyUnicode_FromId(_Py_Identifier *id) { - if (!id->object) { - id->object = PyUnicode_DecodeUTF8Stateful(id->string, - strlen(id->string), - NULL, NULL); - if (!id->object) + PyInterpreterState *interp = _PyInterpreterState_GET(); + struct _Py_unicode_ids *ids = &interp->unicode.ids; + + Py_ssize_t index = _Py_atomic_size_get(&id->index); + if (index < 0) { + struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids; + + PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK); + // Check again to detect concurrent access. Another thread can have + // initialized the index while this thread waited for the lock. + index = _Py_atomic_size_get(&id->index); + if (index < 0) { + assert(rt_ids->next_index < PY_SSIZE_T_MAX); + index = rt_ids->next_index; + rt_ids->next_index++; + _Py_atomic_size_set(&id->index, index); + } + PyThread_release_lock(rt_ids->lock); + } + assert(index >= 0); + + PyObject *obj; + if (index < ids->size) { + obj = ids->array[index]; + if (obj) { + // Return a borrowed reference + return obj; + } + } + + obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string), + NULL, NULL); + if (!obj) { + return NULL; + } + PyUnicode_InternInPlace(&obj); + + if (index >= ids->size) { + // Overallocate to reduce the number of realloc + Py_ssize_t new_size = Py_MAX(index * 2, 16); + Py_ssize_t item_size = sizeof(ids->array[0]); + PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size); + if (new_array == NULL) { + PyErr_NoMemory(); return NULL; - PyUnicode_InternInPlace(&id->object); - assert(!id->next); - id->next = static_strings; - static_strings = id; + } + memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size); + ids->array = new_array; + ids->size = new_size; } - return id->object; + + // The array stores a strong reference + ids->array[index] = obj; + + // Return a borrowed reference + return obj; } + static void -unicode_clear_static_strings(void) +unicode_clear_identifiers(struct _Py_unicode_state *state) { - _Py_Identifier *tmp, *s = static_strings; - while (s) { - Py_CLEAR(s->object); - tmp = s->next; - s->next = NULL; - s = tmp; + struct _Py_unicode_ids *ids = &state->ids; + for (Py_ssize_t i=0; i < ids->size; i++) { + Py_XDECREF(ids->array[i]); } - static_strings = NULL; + ids->size = 0; + PyMem_Free(ids->array); + ids->array = NULL; + // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid + // after Py_Finalize(). } + /* Internal function, doesn't check maximum character */ PyObject* @@ -2371,11 +2451,13 @@ _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size) PyObject *res; unsigned char max_char; - if (size == 0) + if (size == 0) { _Py_RETURN_UNICODE_EMPTY(); + } assert(size > 0); - if (size == 1) + if (size == 1) { return get_latin1_char(u[0]); + } max_char = ucs1lib_find_max_char(u, u + size); res = PyUnicode_New(size, max_char); @@ -2866,35 +2948,35 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, Py_ssize_t arglen; if (*f == 'u') { - if (longflag) - len = sprintf(buffer, "%lu", - va_arg(*vargs, unsigned long)); - else if (longlongflag) - len = sprintf(buffer, "%llu", - va_arg(*vargs, unsigned long long)); - else if (size_tflag) - len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u", - va_arg(*vargs, size_t)); - else - len = sprintf(buffer, "%u", - va_arg(*vargs, unsigned int)); + if (longflag) { + len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long)); + } + else if (longlongflag) { + len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long)); + } + else if (size_tflag) { + len = sprintf(buffer, "%zu", va_arg(*vargs, size_t)); + } + else { + len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int)); + } } else if (*f == 'x') { len = sprintf(buffer, "%x", va_arg(*vargs, int)); } else { - if (longflag) - len = sprintf(buffer, "%li", - va_arg(*vargs, long)); - else if (longlongflag) - len = sprintf(buffer, "%lli", - va_arg(*vargs, long long)); - else if (size_tflag) - len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i", - va_arg(*vargs, Py_ssize_t)); - else - len = sprintf(buffer, "%i", - va_arg(*vargs, int)); + if (longflag) { + len = sprintf(buffer, "%li", va_arg(*vargs, long)); + } + else if (longlongflag) { + len = sprintf(buffer, "%lli", va_arg(*vargs, long long)); + } + else if (size_tflag) { + len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t)); + } + else { + len = sprintf(buffer, "%i", va_arg(*vargs, int)); + } } assert(len >= 0); @@ -3134,9 +3216,11 @@ unicode_get_widechar_size(PyObject *unicode) assert(unicode != NULL); assert(_PyUnicode_CHECK(unicode)); +#if USE_UNICODE_WCHAR_CACHE if (_PyUnicode_WSTR(unicode) != NULL) { return PyUnicode_WSTR_LENGTH(unicode); } +#endif /* USE_UNICODE_WCHAR_CACHE */ assert(PyUnicode_IS_READY(unicode)); res = _PyUnicode_LENGTH(unicode); @@ -3157,16 +3241,21 @@ unicode_get_widechar_size(PyObject *unicode) static void unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size) { - const wchar_t *wstr; - assert(unicode != NULL); assert(_PyUnicode_CHECK(unicode)); - wstr = _PyUnicode_WSTR(unicode); +#if USE_UNICODE_WCHAR_CACHE + const wchar_t *wstr = _PyUnicode_WSTR(unicode); if (wstr != NULL) { memcpy(w, wstr, size * sizeof(wchar_t)); return; } +#else /* USE_UNICODE_WCHAR_CACHE */ + if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) { + memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t)); + return; + } +#endif /* USE_UNICODE_WCHAR_CACHE */ assert(PyUnicode_IS_READY(unicode)); if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { @@ -3293,7 +3382,7 @@ PyUnicode_AsWideCharString(PyObject *unicode, *size = buflen; } else if (wcslen(buffer) != (size_t)buflen) { - PyMem_FREE(buffer); + PyMem_Free(buffer); PyErr_SetString(PyExc_ValueError, "embedded null character"); return NULL; @@ -3303,6 +3392,74 @@ PyUnicode_AsWideCharString(PyObject *unicode, #endif /* HAVE_WCHAR_H */ +int +_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr) +{ + wchar_t **p = (wchar_t **)ptr; + if (obj == NULL) { +#if !USE_UNICODE_WCHAR_CACHE + PyMem_Free(*p); +#endif /* USE_UNICODE_WCHAR_CACHE */ + *p = NULL; + return 1; + } + if (PyUnicode_Check(obj)) { +#if USE_UNICODE_WCHAR_CACHE + *p = (wchar_t *)_PyUnicode_AsUnicode(obj); + if (*p == NULL) { + return 0; + } + return 1; +#else /* USE_UNICODE_WCHAR_CACHE */ + *p = PyUnicode_AsWideCharString(obj, NULL); + if (*p == NULL) { + return 0; + } + return Py_CLEANUP_SUPPORTED; +#endif /* USE_UNICODE_WCHAR_CACHE */ + } + PyErr_Format(PyExc_TypeError, + "argument must be str, not %.50s", + Py_TYPE(obj)->tp_name); + return 0; +} + +int +_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr) +{ + wchar_t **p = (wchar_t **)ptr; + if (obj == NULL) { +#if !USE_UNICODE_WCHAR_CACHE + PyMem_Free(*p); +#endif /* USE_UNICODE_WCHAR_CACHE */ + *p = NULL; + return 1; + } + if (obj == Py_None) { + *p = NULL; + return 1; + } + if (PyUnicode_Check(obj)) { +#if USE_UNICODE_WCHAR_CACHE + *p = (wchar_t *)_PyUnicode_AsUnicode(obj); + if (*p == NULL) { + return 0; + } + return 1; +#else /* USE_UNICODE_WCHAR_CACHE */ + *p = PyUnicode_AsWideCharString(obj, NULL); + if (*p == NULL) { + return 0; + } + return Py_CLEANUP_SUPPORTED; +#endif /* USE_UNICODE_WCHAR_CACHE */ + } + PyErr_Format(PyExc_TypeError, + "argument must be str or None, not %.50s", + Py_TYPE(obj)->tp_name); + return 0; +} + PyObject * PyUnicode_FromOrdinal(int ordinal) { @@ -4126,7 +4283,7 @@ PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) PyErr_NoMemory(); return NULL; } - w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1)); + w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1)); if (w == NULL) { PyErr_NoMemory(); return NULL; @@ -4316,7 +4473,6 @@ unicode_decode_call_errorhandler_wchar( Py_ssize_t requiredsize; Py_ssize_t newpos; PyObject *inputobj = NULL; - wchar_t *repwstr; Py_ssize_t repwlen; if (*errorHandler == NULL) { @@ -4362,12 +4518,19 @@ unicode_decode_call_errorhandler_wchar( goto onError; } +#if USE_UNICODE_WCHAR_CACHE _Py_COMP_DIAG_PUSH _Py_COMP_DIAG_IGNORE_DEPR_DECLS - repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); + repwlen = PyUnicode_GetSize(repunicode); + if (repwlen < 0) + goto onError; _Py_COMP_DIAG_POP - if (repwstr == NULL) +#else /* USE_UNICODE_WCHAR_CACHE */ + repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0); + if (repwlen < 0) goto onError; + repwlen--; +#endif /* USE_UNICODE_WCHAR_CACHE */ /* need more space? (at least enough for what we have+the replacement+the rest of the string (starting at the new input position), so we won't have to check space @@ -4387,7 +4550,7 @@ _Py_COMP_DIAG_POP goto onError; } } - wcsncpy(*buf + *outpos, repwstr, repwlen); + PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen); *outpos += repwlen; *endinpos = newpos; *inptr = *input + newpos; @@ -4950,45 +5113,36 @@ PyUnicode_DecodeUTF8(const char *s, #include "stringlib/codecs.h" #include "stringlib/undef.h" -/* Mask to quickly check whether a C 'long' contains a +/* Mask to quickly check whether a C 'size_t' contains a non-ASCII, UTF8-encoded char. */ -#if (SIZEOF_LONG == 8) -# define ASCII_CHAR_MASK 0x8080808080808080UL -#elif (SIZEOF_LONG == 4) -# define ASCII_CHAR_MASK 0x80808080UL +#if (SIZEOF_SIZE_T == 8) +# define ASCII_CHAR_MASK 0x8080808080808080ULL +#elif (SIZEOF_SIZE_T == 4) +# define ASCII_CHAR_MASK 0x80808080U #else -# error C 'long' size should be either 4 or 8! +# error C 'size_t' size should be either 4 or 8! #endif static Py_ssize_t ascii_decode(const char *start, const char *end, Py_UCS1 *dest) { const char *p = start; - const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); - /* - * Issue #17237: m68k is a bit different from most architectures in - * that objects do not use "natural alignment" - for example, int and - * long are only aligned at 2-byte boundaries. Therefore the assert() - * won't work; also, tests have shown that skipping the "optimised - * version" will even speed up m68k. - */ -#if !defined(__m68k__) -#if SIZEOF_LONG <= SIZEOF_VOID_P - assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG)); - if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { +#if SIZEOF_SIZE_T <= SIZEOF_VOID_P + assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T)); + if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) { /* Fast path, see in STRINGLIB(utf8_decode) for an explanation. */ /* Help allocation */ const char *_p = p; Py_UCS1 * q = dest; - while (_p < aligned_end) { - unsigned long value = *(const unsigned long *) _p; + while (_p + SIZEOF_SIZE_T <= end) { + size_t value = *(const size_t *) _p; if (value & ASCII_CHAR_MASK) break; - *((unsigned long *)q) = value; - _p += SIZEOF_LONG; - q += SIZEOF_LONG; + *((size_t *)q) = value; + _p += SIZEOF_SIZE_T; + q += SIZEOF_SIZE_T; } p = _p; while (p < end) { @@ -4999,18 +5153,17 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest) return p - start; } #endif -#endif while (p < end) { /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h for an explanation. */ - if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { + if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) { /* Help allocation */ const char *_p = p; - while (_p < aligned_end) { - unsigned long value = *(const unsigned long *) _p; + while (_p + SIZEOF_SIZE_T <= end) { + size_t value = *(const size_t *) _p; if (value & ASCII_CHAR_MASK) break; - _p += SIZEOF_LONG; + _p += SIZEOF_SIZE_T; } p = _p; if (_p == end) @@ -5037,8 +5190,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, /* ASCII is equivalent to the first 128 ordinals in Unicode. */ if (size == 1 && (unsigned char)s[0] < 128) { - if (consumed) + if (consumed) { *consumed = 1; + } return get_latin1_char((unsigned char)s[0]); } @@ -5547,7 +5701,7 @@ unicode_fill_utf8(PyObject *unicode) PyBytes_AS_STRING(writer.buffer); Py_ssize_t len = end - start; - char *cache = PyObject_MALLOC(len + 1); + char *cache = PyObject_Malloc(len + 1); if (cache == NULL) { _PyBytesWriter_Dealloc(&writer); PyErr_NoMemory(); @@ -6268,7 +6422,7 @@ PyUnicode_AsUTF16String(PyObject *unicode) /* --- Unicode Escape Codec ----------------------------------------------- */ -static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; +static _PyUnicode_Name_CAPI *ucnhash_capi = NULL; PyObject * _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, @@ -6423,11 +6577,11 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, /* \N{name} */ case 'N': - if (ucnhash_CAPI == NULL) { + if (ucnhash_capi == NULL) { /* load the unicode data module */ - ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( + ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import( PyUnicodeData_CAPSULE_NAME, 1); - if (ucnhash_CAPI == NULL) { + if (ucnhash_capi == NULL) { PyErr_SetString( PyExc_UnicodeError, "\\N escapes not supported (can't load unicodedata module)" @@ -6455,7 +6609,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, s++; ch = 0xffffffff; /* in case 'getcode' messes up */ if (namelen <= INT_MAX && - ucnhash_CAPI->getcode(NULL, start, (int)namelen, + ucnhash_capi->getcode(start, (int)namelen, &ch, 0)) { assert(ch <= MAX_UNICODE); WRITE_CHAR(ch); @@ -7254,8 +7408,9 @@ PyUnicode_DecodeASCII(const char *s, _Py_RETURN_UNICODE_EMPTY(); /* ASCII is equivalent to the first 128 ordinals in Unicode. */ - if (size == 1 && (unsigned char)s[0] < 128) + if (size == 1 && (unsigned char)s[0] < 128) { return get_latin1_char((unsigned char)s[0]); + } // Shortcut for simple case PyObject *u = PyUnicode_New(size, 127); @@ -7736,6 +7891,7 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes, /* Create a substring so that we can get the UTF-16 representation of just the slice under consideration. */ PyObject *substring; + int ret = -1; assert(len > 0); @@ -7747,14 +7903,22 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes, substring = PyUnicode_Substring(unicode, offset, offset+len); if (substring == NULL) return -1; +#if USE_UNICODE_WCHAR_CACHE _Py_COMP_DIAG_PUSH _Py_COMP_DIAG_IGNORE_DEPR_DECLS p = PyUnicode_AsUnicodeAndSize(substring, &size); -_Py_COMP_DIAG_POP if (p == NULL) { Py_DECREF(substring); return -1; } +_Py_COMP_DIAG_POP +#else /* USE_UNICODE_WCHAR_CACHE */ + p = PyUnicode_AsWideCharString(substring, &size); + Py_CLEAR(substring); + if (p == NULL) { + return -1; + } +#endif /* USE_UNICODE_WCHAR_CACHE */ assert(size <= INT_MAX); /* First get the size of the result */ @@ -7766,16 +7930,15 @@ _Py_COMP_DIAG_POP goto error; /* If we used a default char, then we failed! */ if (pusedDefaultChar && *pusedDefaultChar) { - Py_DECREF(substring); - return -2; + ret = -2; + goto done; } if (*outbytes == NULL) { /* Create string object */ *outbytes = PyBytes_FromStringAndSize(NULL, outsize); if (*outbytes == NULL) { - Py_DECREF(substring); - return -1; + goto done; } out = PyBytes_AS_STRING(*outbytes); } @@ -7784,12 +7947,10 @@ _Py_COMP_DIAG_POP const Py_ssize_t n = PyBytes_Size(*outbytes); if (outsize > PY_SSIZE_T_MAX - n) { PyErr_NoMemory(); - Py_DECREF(substring); - return -1; + goto done; } if (_PyBytes_Resize(outbytes, n + outsize) < 0) { - Py_DECREF(substring); - return -1; + goto done; } out = PyBytes_AS_STRING(*outbytes) + n; } @@ -7799,19 +7960,29 @@ _Py_COMP_DIAG_POP p, (int)size, out, outsize, NULL, pusedDefaultChar); - Py_CLEAR(substring); if (outsize <= 0) goto error; - if (pusedDefaultChar && *pusedDefaultChar) - return -2; - return 0; + if (pusedDefaultChar && *pusedDefaultChar) { + ret = -2; + goto done; + } + ret = 0; + +done: +#if USE_UNICODE_WCHAR_CACHE + Py_DECREF(substring); +#else /* USE_UNICODE_WCHAR_CACHE */ + PyMem_Free(p); +#endif /* USE_UNICODE_WCHAR_CACHE */ + return ret; error: - Py_XDECREF(substring); - if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) - return -2; + if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) { + ret = -2; + goto done; + } PyErr_SetFromWindowsErr(0); - return -1; + goto done; } /* @@ -8496,11 +8667,13 @@ PyUnicode_BuildEncodingMap(PyObject* string) } /* Create a three-level trie */ - result = PyObject_MALLOC(sizeof(struct encoding_map) + + result = PyObject_Malloc(sizeof(struct encoding_map) + 16*count2 + 128*count3 - 1); - if (!result) + if (!result) { return PyErr_NoMemory(); - PyObject_Init(result, &EncodingMapType); + } + + _PyObject_Init(result, &EncodingMapType); mresult = (struct encoding_map*)result; mresult->count2 = count2; mresult->count3 = count3; @@ -10161,7 +10334,7 @@ case_operation(PyObject *self, PyErr_SetString(PyExc_OverflowError, "string is too long"); return NULL; } - tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); + tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length); if (tmp == NULL) return PyErr_NoMemory(); newlength = perform(kind, data, length, tmp, &maxchar); @@ -10185,7 +10358,7 @@ case_operation(PyObject *self, Py_UNREACHABLE(); } leave: - PyMem_FREE(tmp); + PyMem_Free(tmp); return res; } @@ -10924,10 +11097,7 @@ replace(PyObject *self, PyObject *str1, } new_size = slen + n * (len2 - len1); if (new_size == 0) { - _Py_INCREF_UNICODE_EMPTY(); - if (!unicode_empty) - goto error; - u = unicode_empty; + u = unicode_new_empty(); goto done; } if (new_size > (PY_SSIZE_T_MAX / rkind)) { @@ -11003,11 +11173,11 @@ replace(PyObject *self, PyObject *str1, assert(release1 == (buf1 != PyUnicode_DATA(str1))); assert(release2 == (buf2 != PyUnicode_DATA(str2))); if (srelease) - PyMem_FREE((void *)sbuf); + PyMem_Free((void *)sbuf); if (release1) - PyMem_FREE((void *)buf1); + PyMem_Free((void *)buf1); if (release2) - PyMem_FREE((void *)buf2); + PyMem_Free((void *)buf2); assert(_PyUnicode_CheckConsistency(u, 1)); return u; @@ -11017,11 +11187,11 @@ replace(PyObject *self, PyObject *str1, assert(release1 == (buf1 != PyUnicode_DATA(str1))); assert(release2 == (buf2 != PyUnicode_DATA(str2))); if (srelease) - PyMem_FREE((void *)sbuf); + PyMem_Free((void *)sbuf); if (release1) - PyMem_FREE((void *)buf1); + PyMem_Free((void *)buf1); if (release2) - PyMem_FREE((void *)buf2); + PyMem_Free((void *)buf2); return unicode_result_unchanged(self); error: @@ -11029,11 +11199,11 @@ replace(PyObject *self, PyObject *str1, assert(release1 == (buf1 != PyUnicode_DATA(str1))); assert(release2 == (buf2 != PyUnicode_DATA(str2))); if (srelease) - PyMem_FREE((void *)sbuf); + PyMem_Free((void *)sbuf); if (release1) - PyMem_FREE((void *)buf1); + PyMem_Free((void *)buf1); if (release2) - PyMem_FREE((void *)buf2); + PyMem_Free((void *)buf2); return NULL; } @@ -11450,8 +11620,9 @@ _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right) #ifdef INTERNED_STRINGS assert(_PyUnicode_HASH(right_uni) != -1); Py_hash_t hash = _PyUnicode_HASH(left); - if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) + if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) { return 0; + } #endif return unicode_compare_eq(left, right_uni); @@ -11585,10 +11756,13 @@ PyUnicode_Concat(PyObject *left, PyObject *right) return NULL; /* Shortcuts */ - if (left == unicode_empty) + PyObject *empty = unicode_get_empty(); // Borrowed reference + if (left == empty) { return PyUnicode_FromObject(right); - if (right == unicode_empty) + } + if (right == empty) { return PyUnicode_FromObject(left); + } left_len = PyUnicode_GET_LENGTH(left); right_len = PyUnicode_GET_LENGTH(right); @@ -11639,14 +11813,16 @@ PyUnicode_Append(PyObject **p_left, PyObject *right) goto error; /* Shortcuts */ - if (left == unicode_empty) { + PyObject *empty = unicode_get_empty(); // Borrowed reference + if (left == empty) { Py_DECREF(left); Py_INCREF(right); *p_left = right; return; } - if (right == unicode_empty) + if (right == empty) { return; + } left_len = PyUnicode_GET_LENGTH(left); right_len = PyUnicode_GET_LENGTH(right); @@ -13343,14 +13519,8 @@ PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj) len1 = PyUnicode_GET_LENGTH(str_obj); len2 = PyUnicode_GET_LENGTH(sep_obj); if (kind1 < kind2 || len1 < len2) { - _Py_INCREF_UNICODE_EMPTY(); - if (!unicode_empty) - out = NULL; - else { - out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty); - Py_DECREF(unicode_empty); - } - return out; + PyObject *empty = unicode_get_empty(); // Borrowed reference + return PyTuple_Pack(3, str_obj, empty, empty); } buf1 = PyUnicode_DATA(str_obj); buf2 = PyUnicode_DATA(sep_obj); @@ -13401,14 +13571,8 @@ PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj) len1 = PyUnicode_GET_LENGTH(str_obj); len2 = PyUnicode_GET_LENGTH(sep_obj); if (kind1 < kind2 || len1 < len2) { - _Py_INCREF_UNICODE_EMPTY(); - if (!unicode_empty) - out = NULL; - else { - out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj); - Py_DECREF(unicode_empty); - } - return out; + PyObject *empty = unicode_get_empty(); // Borrowed reference + return PyTuple_Pack(3, empty, empty, str_obj); } buf1 = PyUnicode_DATA(str_obj); buf2 = PyUnicode_DATA(sep_obj); @@ -14731,20 +14895,15 @@ mainformatlong(PyObject *v, /* make sure number is a type of integer for o, x, and X */ if (!PyLong_Check(v)) { if (type == 'o' || type == 'x' || type == 'X') { - iobj = PyNumber_Index(v); - if (iobj == NULL) { - if (PyErr_ExceptionMatches(PyExc_TypeError)) - goto wrongtype; - return -1; - } + iobj = _PyNumber_Index(v); } else { iobj = PyNumber_Long(v); - if (iobj == NULL ) { - if (PyErr_ExceptionMatches(PyExc_TypeError)) - goto wrongtype; - return -1; - } + } + if (iobj == NULL ) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) + goto wrongtype; + return -1; } assert(PyLong_Check(iobj)); } @@ -14808,7 +14967,7 @@ wrongtype: break; default: PyErr_Format(PyExc_TypeError, - "%%%c format: a number is required, " + "%%%c format: a real number is required, " "not %.200s", type, Py_TYPE(v)->tp_name); break; @@ -14827,24 +14986,17 @@ formatchar(PyObject *v) goto onError; } else { - PyObject *iobj; - long x; - /* make sure number is a type of integer */ - if (!PyLong_Check(v)) { - iobj = PyNumber_Index(v); - if (iobj == NULL) { + int overflow; + long x = PyLong_AsLongAndOverflow(v, &overflow); + if (x == -1 && PyErr_Occurred()) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) { goto onError; } - x = PyLong_AsLong(iobj); - Py_DECREF(iobj); - } - else { - x = PyLong_AsLong(v); + return (Py_UCS4) -1; } - if (x == -1 && PyErr_Occurred()) - goto onError; if (x < 0 || x > MAX_UNICODE) { + /* this includes an overflow in converting to C long */ PyErr_SetString(PyExc_OverflowError, "%c arg not in range(0x110000)"); return (Py_UCS4) -1; @@ -15442,52 +15594,57 @@ PyUnicode_Format(PyObject *format, PyObject *args) } static PyObject * -unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); +unicode_subtype_new(PyTypeObject *type, PyObject *unicode); + +/*[clinic input] +@classmethod +str.__new__ as unicode_new + + object as x: object = NULL + encoding: str = NULL + errors: str = NULL + +[clinic start generated code]*/ static PyObject * -unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding, + const char *errors) +/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/ { - PyObject *x = NULL; - static char *kwlist[] = {"object", "encoding", "errors", 0}; - char *encoding = NULL; - char *errors = NULL; + PyObject *unicode; + if (x == NULL) { + unicode = unicode_new_empty(); + } + else if (encoding == NULL && errors == NULL) { + unicode = PyObject_Str(x); + } + else { + unicode = PyUnicode_FromEncodedObject(x, encoding, errors); + } - if (type != &PyUnicode_Type) - return unicode_subtype_new(type, args, kwds); - if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", - kwlist, &x, &encoding, &errors)) - return NULL; - if (x == NULL) - _Py_RETURN_UNICODE_EMPTY(); - if (encoding == NULL && errors == NULL) - return PyObject_Str(x); - else - return PyUnicode_FromEncodedObject(x, encoding, errors); + if (unicode != NULL && type != &PyUnicode_Type) { + Py_SETREF(unicode, unicode_subtype_new(type, unicode)); + } + return unicode; } static PyObject * -unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +unicode_subtype_new(PyTypeObject *type, PyObject *unicode) { - PyObject *unicode, *self; + PyObject *self; Py_ssize_t length, char_size; int share_wstr, share_utf8; unsigned int kind; void *data; assert(PyType_IsSubtype(type, &PyUnicode_Type)); - - unicode = unicode_new(&PyUnicode_Type, args, kwds); - if (unicode == NULL) - return NULL; assert(_PyUnicode_CHECK(unicode)); if (PyUnicode_READY(unicode) == -1) { - Py_DECREF(unicode); return NULL; } self = type->tp_alloc(type, 0); if (self == NULL) { - Py_DECREF(unicode); return NULL; } kind = PyUnicode_KIND(unicode); @@ -15534,7 +15691,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) PyErr_NoMemory(); goto onError; } - data = PyObject_MALLOC((length + 1) * char_size); + data = PyObject_Malloc((length + 1) * char_size); if (data == NULL) { PyErr_NoMemory(); goto onError; @@ -15556,11 +15713,9 @@ unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) #ifdef Py_DEBUG _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); #endif - Py_DECREF(unicode); return self; onError: - Py_DECREF(unicode); Py_DECREF(self); return NULL; } @@ -15601,7 +15756,8 @@ PyTypeObject PyUnicode_Type = { 0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | - Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ + Py_TPFLAGS_UNICODE_SUBCLASS | + _Py_TPFLAGS_MATCH_SELF, /* tp_flags */ unicode_doc, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ @@ -15626,36 +15782,40 @@ PyTypeObject PyUnicode_Type = { /* Initialize the Unicode implementation */ PyStatus -_PyUnicode_Init(void) +_PyUnicode_Init(PyInterpreterState *interp) { - /* XXX - move this array to unicodectype.c ? */ - Py_UCS2 linebreak[] = { - 0x000A, /* LINE FEED */ - 0x000D, /* CARRIAGE RETURN */ - 0x001C, /* FILE SEPARATOR */ - 0x001D, /* GROUP SEPARATOR */ - 0x001E, /* RECORD SEPARATOR */ - 0x0085, /* NEXT LINE */ - 0x2028, /* LINE SEPARATOR */ - 0x2029, /* PARAGRAPH SEPARATOR */ - }; + struct _Py_unicode_state *state = &interp->unicode; + if (unicode_create_empty_string_singleton(state) < 0) { + return _PyStatus_NO_MEMORY(); + } - /* Init the implementation */ - _Py_INCREF_UNICODE_EMPTY(); - if (!unicode_empty) { - return _PyStatus_ERR("Can't create empty string"); + if (_Py_IsMainInterpreter(interp)) { + /* initialize the linebreak bloom filter */ + const Py_UCS2 linebreak[] = { + 0x000A, /* LINE FEED */ + 0x000D, /* CARRIAGE RETURN */ + 0x001C, /* FILE SEPARATOR */ + 0x001D, /* GROUP SEPARATOR */ + 0x001E, /* RECORD SEPARATOR */ + 0x0085, /* NEXT LINE */ + 0x2028, /* LINE SEPARATOR */ + 0x2029, /* PARAGRAPH SEPARATOR */ + }; + bloom_linebreak = make_bloom_mask( + PyUnicode_2BYTE_KIND, linebreak, + Py_ARRAY_LENGTH(linebreak)); } - Py_DECREF(unicode_empty); + return _PyStatus_OK(); +} + + +PyStatus +_PyUnicode_InitTypes(void) +{ if (PyType_Ready(&PyUnicode_Type) < 0) { return _PyStatus_ERR("Can't initialize unicode type"); } - - /* initialize the linebreak bloom filter */ - bloom_linebreak = make_bloom_mask( - PyUnicode_2BYTE_KIND, linebreak, - Py_ARRAY_LENGTH(linebreak)); - if (PyType_Ready(&EncodingMapType) < 0) { return _PyStatus_ERR("Can't initialize encoding map type"); } @@ -15693,6 +15853,11 @@ PyUnicode_InternInPlace(PyObject **p) } #ifdef INTERNED_STRINGS + if (PyUnicode_READY(s) == -1) { + PyErr_Clear(); + return; + } + if (interned == NULL) { interned = PyDict_New(); if (interned == NULL) { @@ -15701,9 +15866,7 @@ PyUnicode_InternInPlace(PyObject **p) } } - PyObject *t; - t = PyDict_SetDefault(interned, s, s); - + PyObject *t = PyDict_SetDefault(interned, s, s); if (t == NULL) { PyErr_Clear(); return; @@ -15715,16 +15878,30 @@ PyUnicode_InternInPlace(PyObject **p) return; } - /* The two references in interned are not counted by refcnt. - The deallocator will take care of this */ + /* The two references in interned dict (key and value) are not counted by + refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of + this. */ Py_SET_REFCNT(s, Py_REFCNT(s) - 2); _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; +#else + // PyDict expects that interned strings have their hash + // (PyASCIIObject.hash) already computed. + (void)unicode_hash(s); #endif } void PyUnicode_InternImmortal(PyObject **p) { + if (PyErr_WarnEx(PyExc_DeprecationWarning, + "PyUnicode_InternImmortal() is deprecated; " + "use PyUnicode_InternInPlace() instead", 1) < 0) + { + // The function has no return value, the exception cannot + // be reported to the caller, so just log it. + PyErr_WriteUnraisable(NULL); + } + PyUnicode_InternInPlace(p); if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; @@ -15743,45 +15920,45 @@ PyUnicode_InternFromString(const char *cp) } -#if defined(WITH_VALGRIND) || defined(__INSURE__) -static void -unicode_release_interned(void) +void +_PyUnicode_ClearInterned(PyInterpreterState *interp) { - if (interned == NULL || !PyDict_Check(interned)) { + if (!_Py_IsMainInterpreter(interp)) { + // interned dict is shared by all interpreters return; } - PyObject *keys = PyDict_Keys(interned); - if (keys == NULL || !PyList_Check(keys)) { - PyErr_Clear(); + + if (interned == NULL) { return; } + assert(PyDict_CheckExact(interned)); - /* Since unicode_release_interned() is intended to help a leak - detector, interned unicode strings are not forcibly deallocated; - rather, we give them their stolen references back, and then clear - and DECREF the interned dict. */ + /* Interned unicode strings are not forcibly deallocated; rather, we give + them their stolen references back, and then clear and DECREF the + interned dict. */ - Py_ssize_t n = PyList_GET_SIZE(keys); #ifdef INTERNED_STATS - fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", - n); + fprintf(stderr, "releasing %zd interned strings\n", + PyDict_GET_SIZE(interned)); Py_ssize_t immortal_size = 0, mortal_size = 0; #endif - for (Py_ssize_t i = 0; i < n; i++) { - PyObject *s = PyList_GET_ITEM(keys, i); - if (PyUnicode_READY(s) == -1) { - Py_UNREACHABLE(); - } + Py_ssize_t pos = 0; + PyObject *s, *ignored_value; + while (PyDict_Next(interned, &pos, &s, &ignored_value)) { + assert(PyUnicode_IS_READY(s)); + switch (PyUnicode_CHECK_INTERNED(s)) { case SSTATE_INTERNED_IMMORTAL: - Py_REFCNT(s) += 1; + Py_SET_REFCNT(s, Py_REFCNT(s) + 1); #ifdef INTERNED_STATS immortal_size += PyUnicode_GET_LENGTH(s); #endif break; case SSTATE_INTERNED_MORTAL: - Py_REFCNT(s) += 2; + // Restore the two references (key and value) ignored + // by PyUnicode_InternInPlace(). + Py_SET_REFCNT(s, Py_REFCNT(s) + 2); #ifdef INTERNED_STATS mortal_size += PyUnicode_GET_LENGTH(s); #endif @@ -15794,15 +15971,14 @@ unicode_release_interned(void) _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; } #ifdef INTERNED_STATS - fprintf(stderr, "total size of all interned strings: " - "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " - "mortal/immortal\n", mortal_size, immortal_size); + fprintf(stderr, + "total size of all interned strings: %zd/%zd mortal/immortal\n", + mortal_size, immortal_size); #endif - Py_DECREF(keys); + PyDict_Clear(interned); Py_CLEAR(interned); } -#endif /********************* Unicode Iterator **************************/ @@ -15964,127 +16140,6 @@ unicode_iter(PyObject *seq) return (PyObject *)it; } - -size_t -Py_UNICODE_strlen(const Py_UNICODE *u) -{ - return wcslen(u); -} - -Py_UNICODE* -Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) -{ - Py_UNICODE *u = s1; - while ((*u++ = *s2++)); - return s1; -} - -Py_UNICODE* -Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) -{ - Py_UNICODE *u = s1; - while ((*u++ = *s2++)) - if (n-- == 0) - break; - return s1; -} - -Py_UNICODE* -Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) -{ - Py_UNICODE *u1 = s1; - u1 += wcslen(u1); - while ((*u1++ = *s2++)); - return s1; -} - -int -Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) -{ - while (*s1 && *s2 && *s1 == *s2) - s1++, s2++; - if (*s1 && *s2) - return (*s1 < *s2) ? -1 : +1; - if (*s1) - return 1; - if (*s2) - return -1; - return 0; -} - -int -Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) -{ - Py_UNICODE u1, u2; - for (; n != 0; n--) { - u1 = *s1; - u2 = *s2; - if (u1 != u2) - return (u1 < u2) ? -1 : +1; - if (u1 == '\0') - return 0; - s1++; - s2++; - } - return 0; -} - -Py_UNICODE* -Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) -{ - const Py_UNICODE *p; - for (p = s; *p; p++) - if (*p == c) - return (Py_UNICODE*)p; - return NULL; -} - -Py_UNICODE* -Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) -{ - const Py_UNICODE *p; - p = s + wcslen(s); - while (p != s) { - p--; - if (*p == c) - return (Py_UNICODE*)p; - } - return NULL; -} - -Py_UNICODE* -PyUnicode_AsUnicodeCopy(PyObject *unicode) -{ - Py_UNICODE *u, *copy; - Py_ssize_t len, size; - - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; - } -_Py_COMP_DIAG_PUSH -_Py_COMP_DIAG_IGNORE_DEPR_DECLS - u = PyUnicode_AsUnicodeAndSize(unicode, &len); -_Py_COMP_DIAG_POP - if (u == NULL) - return NULL; - /* Ensure we won't overflow the size. */ - if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { - PyErr_NoMemory(); - return NULL; - } - size = len + 1; /* copy the null character */ - size *= sizeof(Py_UNICODE); - copy = PyMem_Malloc(size); - if (copy == NULL) { - PyErr_NoMemory(); - return NULL; - } - memcpy(copy, u, size); - return copy; -} - - static int encode_wstr_utf8(wchar_t *wstr, char **str, const char *name) { @@ -16150,10 +16205,10 @@ error: static PyStatus -init_stdio_encoding(PyThreadState *tstate) +init_stdio_encoding(PyInterpreterState *interp) { /* Update the stdio encoding to the normalized Python codec name. */ - PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp); + PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp); if (config_get_codec_name(&config->stdio_encoding) < 0) { return _PyStatus_ERR("failed to get the Python codec name " "of the stdio encoding"); @@ -16246,7 +16301,7 @@ _PyUnicode_InitEncodings(PyThreadState *tstate) return status; } - return init_stdio_encoding(tstate); + return init_stdio_encoding(tstate->interp); } @@ -16290,33 +16345,23 @@ _PyUnicode_EnableLegacyWindowsFSEncoding(void) void -_PyUnicode_Fini(PyThreadState *tstate) +_PyUnicode_Fini(PyInterpreterState *interp) { - if (_Py_IsMainInterpreter(tstate)) { -#if defined(WITH_VALGRIND) || defined(__INSURE__) - /* Insure++ is a memory analysis tool that aids in discovering - * memory leaks and other memory problems. On Python exit, the - * interned string dictionaries are flagged as being in use at exit - * (which it is). Under normal circumstances, this is fine because - * the memory will be automatically reclaimed by the system. Under - * memory debugging, it's a huge source of useless noise, so we - * trade off slower shutdown for less distraction in the memory - * reports. -baw - */ - unicode_release_interned(); -#endif /* __INSURE__ */ + struct _Py_unicode_state *state = &interp->unicode; - Py_CLEAR(unicode_empty); - -#ifdef LATIN1_SINGLETONS - for (Py_ssize_t i = 0; i < 256; i++) { - Py_CLEAR(unicode_latin1[i]); - } -#endif - unicode_clear_static_strings(); + if (_Py_IsMainInterpreter(interp)) { + // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini() + assert(interned == NULL); } - _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec); + _PyUnicode_FiniEncodings(&state->fs_codec); + + unicode_clear_identifiers(state); + + for (Py_ssize_t i = 0; i < 256; i++) { + Py_CLEAR(state->latin1[i]); + } + Py_CLEAR(state->empty_string); } @@ -16333,20 +16378,16 @@ static PyMethodDef _string_methods[] = { static struct PyModuleDef _string_module = { PyModuleDef_HEAD_INIT, - "_string", - PyDoc_STR("string helper module"), - 0, - _string_methods, - NULL, - NULL, - NULL, - NULL + .m_name = "_string", + .m_doc = PyDoc_STR("string helper module"), + .m_size = 0, + .m_methods = _string_methods, }; PyMODINIT_FUNC PyInit__string(void) { - return PyModule_Create(&_string_module); + return PyModuleDef_Init(&_string_module); } |
