summaryrefslogtreecommitdiffstats
path: root/contrib/tools/python3/src/Objects/unicodeobject.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/tools/python3/src/Objects/unicodeobject.c')
-rw-r--r--contrib/tools/python3/src/Objects/unicodeobject.c2196
1 files changed, 702 insertions, 1494 deletions
diff --git a/contrib/tools/python3/src/Objects/unicodeobject.c b/contrib/tools/python3/src/Objects/unicodeobject.c
index b5510fc1c85..f6787c8f8aa 100644
--- a/contrib/tools/python3/src/Objects/unicodeobject.c
+++ b/contrib/tools/python3/src/Objects/unicodeobject.c
@@ -54,7 +54,9 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#include "pycore_pystate.h" // _PyInterpreterState_GET()
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
+#include "pycore_unicodeobject_generated.h" // _PyUnicode_InitStaticStrings()
#include "stringlib/eq.h" // unicode_eq()
+#include <stddef.h> // ptrdiff_t
#ifdef MS_WINDOWS
#include <windows.h>
@@ -115,7 +117,6 @@ extern "C" {
(_PyCompactUnicodeObject_CAST(op)->utf8)
#define PyUnicode_UTF8(op) \
(assert(_PyUnicode_CHECK(op)), \
- assert(PyUnicode_IS_READY(op)), \
PyUnicode_IS_COMPACT_ASCII(op) ? \
((char*)(_PyASCIIObject_CAST(op) + 1)) : \
_PyUnicode_UTF8(op))
@@ -123,21 +124,10 @@ extern "C" {
(_PyCompactUnicodeObject_CAST(op)->utf8_length)
#define PyUnicode_UTF8_LENGTH(op) \
(assert(_PyUnicode_CHECK(op)), \
- assert(PyUnicode_IS_READY(op)), \
PyUnicode_IS_COMPACT_ASCII(op) ? \
_PyASCIIObject_CAST(op)->length : \
_PyUnicode_UTF8_LENGTH(op))
-#define _PyUnicode_WSTR(op) \
- (_PyASCIIObject_CAST(op)->wstr)
-/* Don't use deprecated macro of unicodeobject.h */
-#undef PyUnicode_WSTR_LENGTH
-#define PyUnicode_WSTR_LENGTH(op) \
- (PyUnicode_IS_COMPACT_ASCII(op) ? \
- _PyASCIIObject_CAST(op)->length : \
- _PyCompactUnicodeObject_CAST(op)->wstr_length)
-#define _PyUnicode_WSTR_LENGTH(op) \
- (_PyCompactUnicodeObject_CAST(op)->wstr_length)
#define _PyUnicode_LENGTH(op) \
(_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op) \
@@ -153,20 +143,10 @@ extern "C" {
#define _PyUnicode_DATA_ANY(op) \
(_PyUnicodeObject_CAST(op)->data.any)
-#undef PyUnicode_READY
-#define PyUnicode_READY(op) \
- (assert(_PyUnicode_CHECK(op)), \
- (PyUnicode_IS_READY(op) ? \
- 0 : \
- _PyUnicode_Ready(op)))
-
#define _PyUnicode_SHARE_UTF8(op) \
(assert(_PyUnicode_CHECK(op)), \
assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
(_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
-#define _PyUnicode_SHARE_WSTR(op) \
- (assert(_PyUnicode_CHECK(op)), \
- (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op)))
/* true if the Unicode object has an allocated UTF-8 memory block
(not shared with other data) */
@@ -175,13 +155,6 @@ extern "C" {
&& _PyUnicode_UTF8(op) \
&& _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
-/* true if the Unicode object has an allocated wstr memory block
- (not shared with other data) */
-#define _PyUnicode_HAS_WSTR_MEMORY(op) \
- ((_PyUnicode_WSTR(op) && \
- (!PyUnicode_IS_READY(op) || \
- _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
-
/* Generic helper macro to convert characters of different types.
from_type and to_type have to be valid type names, begin and end
are pointers to the source characters which should be of type
@@ -219,16 +192,6 @@ extern "C" {
# define OVERALLOCATE_FACTOR 4
#endif
-/* This dictionary holds all interned unicode strings. Note that references
- to strings in this dictionary are *not* counted in the string's ob_refcnt.
- When the interned string reaches a refcnt of 0 the string deallocation
- function will delete the reference from this dictionary.
-
- Another way to look at this is that to say that the actual reference
- count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
-*/
-static PyObject *interned = NULL;
-
/* Forward declaration */
static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
@@ -259,8 +222,89 @@ static inline PyObject* unicode_get_empty(void)
static inline PyObject* unicode_new_empty(void)
{
PyObject *empty = unicode_get_empty();
- Py_INCREF(empty);
- return empty;
+ return Py_NewRef(empty);
+}
+
+/* This dictionary holds all interned unicode strings. Note that references
+ to strings in this dictionary are *not* counted in the string's ob_refcnt.
+ When the interned string reaches a refcnt of 0 the string deallocation
+ function will delete the reference from this dictionary.
+*/
+static inline PyObject *get_interned_dict(PyInterpreterState *interp)
+{
+ return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
+}
+
+#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
+
+Py_ssize_t
+_PyUnicode_InternedSize(void)
+{
+ PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
+ return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
+}
+
+static Py_hash_t unicode_hash(PyObject *);
+static int unicode_compare_eq(PyObject *, PyObject *);
+
+static Py_uhash_t
+hashtable_unicode_hash(const void *key)
+{
+ return unicode_hash((PyObject *)key);
+}
+
+static int
+hashtable_unicode_compare(const void *key1, const void *key2)
+{
+ PyObject *obj1 = (PyObject *)key1;
+ PyObject *obj2 = (PyObject *)key2;
+ if (obj1 != NULL && obj2 != NULL) {
+ return unicode_compare_eq(obj1, obj2);
+ }
+ else {
+ return obj1 == obj2;
+ }
+}
+
+static int
+init_interned_dict(PyInterpreterState *interp)
+{
+ if (_Py_IsMainInterpreter(interp)) {
+ assert(INTERNED_STRINGS == NULL);
+ _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
+ INTERNED_STRINGS = _Py_hashtable_new_full(
+ hashtable_unicode_hash,
+ hashtable_unicode_compare,
+ NULL,
+ NULL,
+ &hashtable_alloc
+ );
+ if (INTERNED_STRINGS == NULL) {
+ return -1;
+ }
+ }
+ assert(get_interned_dict(interp) == NULL);
+ PyObject *interned = interned = PyDict_New();
+ if (interned == NULL) {
+ return -1;
+ }
+ _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
+ return 0;
+}
+
+static void
+clear_interned_dict(PyInterpreterState *interp)
+{
+ PyObject *interned = get_interned_dict(interp);
+ if (interned != NULL) {
+ PyDict_Clear(interned);
+ Py_DECREF(interned);
+ _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
+ }
+ if (_Py_IsMainInterpreter(interp) && INTERNED_STRINGS != NULL) {
+ _Py_hashtable_destroy(INTERNED_STRINGS);
+ INTERNED_STRINGS = NULL;
+ }
}
#define _Py_RETURN_UNICODE_EMPTY() \
@@ -269,11 +313,10 @@ static inline PyObject* unicode_new_empty(void)
} while (0)
static inline void
-unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
+unicode_fill(int kind, void *data, Py_UCS4 value,
Py_ssize_t start, Py_ssize_t length)
{
assert(0 <= start);
- assert(kind != PyUnicode_WCHAR_KIND);
switch (kind) {
case PyUnicode_1BYTE_KIND: {
assert(value <= 0xff);
@@ -335,7 +378,6 @@ const unsigned char _Py_ascii_whitespace[] = {
};
/* forward */
-static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);
@@ -478,7 +520,14 @@ unicode_check_encoding_errors(const char *encoding, const char *errors)
return 0;
}
- if (encoding != NULL) {
+ if (encoding != NULL
+ // Fast path for the most common built-in encodings. Even if the codec
+ // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
+ // create a temporary Unicode string (the key in the cache).
+ && strcmp(encoding, "utf-8") != 0
+ && strcmp(encoding, "utf8") != 0
+ && strcmp(encoding, "ascii") != 0)
+ {
PyObject *handler = _PyCodec_Lookup(encoding);
if (handler == NULL) {
return -1;
@@ -486,7 +535,14 @@ unicode_check_encoding_errors(const char *encoding, const char *errors)
Py_DECREF(handler);
}
- if (errors != NULL) {
+ if (errors != NULL
+ // Fast path for the most common built-in error handlers.
+ && strcmp(errors, "strict") != 0
+ && strcmp(errors, "ignore") != 0
+ && strcmp(errors, "replace") != 0
+ && strcmp(errors, "surrogateescape") != 0
+ && strcmp(errors, "surrogatepass") != 0)
+ {
PyObject *handler = PyCodec_LookupError(errors);
if (handler == NULL) {
return -1;
@@ -507,11 +563,10 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
CHECK(PyUnicode_Check(op));
PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
- unsigned int kind = ascii->state.kind;
+ int kind = ascii->state.kind;
if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
CHECK(kind == PyUnicode_1BYTE_KIND);
- CHECK(ascii->state.ready == 1);
}
else {
PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
@@ -523,62 +578,32 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
|| kind == PyUnicode_2BYTE_KIND
|| kind == PyUnicode_4BYTE_KIND);
CHECK(ascii->state.ascii == 0);
- CHECK(ascii->state.ready == 1);
CHECK(compact->utf8 != data);
}
else {
PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
data = unicode->data.any;
- if (kind == PyUnicode_WCHAR_KIND) {
- CHECK(ascii->length == 0);
- CHECK(ascii->hash == -1);
- CHECK(ascii->state.compact == 0);
- CHECK(ascii->state.ascii == 0);
- CHECK(ascii->state.ready == 0);
- CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
- CHECK(ascii->wstr != NULL);
- CHECK(data == NULL);
- CHECK(compact->utf8 == NULL);
+ CHECK(kind == PyUnicode_1BYTE_KIND
+ || kind == PyUnicode_2BYTE_KIND
+ || kind == PyUnicode_4BYTE_KIND);
+ CHECK(ascii->state.compact == 0);
+ CHECK(data != NULL);
+ if (ascii->state.ascii) {
+ CHECK(compact->utf8 == data);
+ CHECK(compact->utf8_length == ascii->length);
}
else {
- CHECK(kind == PyUnicode_1BYTE_KIND
- || kind == PyUnicode_2BYTE_KIND
- || kind == PyUnicode_4BYTE_KIND);
- CHECK(ascii->state.compact == 0);
- CHECK(ascii->state.ready == 1);
- CHECK(data != NULL);
- if (ascii->state.ascii) {
- CHECK(compact->utf8 == data);
- CHECK(compact->utf8_length == ascii->length);
- }
- else
- CHECK(compact->utf8 != data);
+ CHECK(compact->utf8 != data);
}
}
- if (kind != PyUnicode_WCHAR_KIND) {
- if (
-#if SIZEOF_WCHAR_T == 2
- kind == PyUnicode_2BYTE_KIND
-#else
- kind == PyUnicode_4BYTE_KIND
-#endif
- )
- {
- CHECK(ascii->wstr == data);
- CHECK(compact->wstr_length == ascii->length);
- } else
- CHECK(ascii->wstr != data);
- }
if (compact->utf8 == NULL)
CHECK(compact->utf8_length == 0);
- if (ascii->wstr == NULL)
- CHECK(compact->wstr_length == 0);
}
/* check that the best kind is used: O(n) operation */
- if (check_content && kind != PyUnicode_WCHAR_KIND) {
+ if (check_content) {
Py_ssize_t i;
Py_UCS4 maxchar = 0;
const void *data;
@@ -614,47 +639,12 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
#undef CHECK
}
-
static PyObject*
-unicode_result_wchar(PyObject *unicode)
-{
-#ifndef Py_DEBUG
- Py_ssize_t len;
-
- len = _PyUnicode_WSTR_LENGTH(unicode);
- if (len == 0) {
- Py_DECREF(unicode);
- _Py_RETURN_UNICODE_EMPTY();
- }
-
- if (len == 1) {
- wchar_t ch = _PyUnicode_WSTR(unicode)[0];
- if ((Py_UCS4)ch < 256) {
- Py_DECREF(unicode);
- return get_latin1_char((unsigned char)ch);
- }
- }
-
- if (_PyUnicode_Ready(unicode) < 0) {
- Py_DECREF(unicode);
- return NULL;
- }
-#else
- assert(Py_REFCNT(unicode) == 1);
-
- /* don't make the result ready in debug mode to ensure that the caller
- makes the string ready before using it */
- assert(_PyUnicode_CheckConsistency(unicode, 1));
-#endif
- return unicode;
-}
-
-static PyObject*
-unicode_result_ready(PyObject *unicode)
+unicode_result(PyObject *unicode)
{
- Py_ssize_t length;
+ assert(_PyUnicode_CHECK(unicode));
- length = PyUnicode_GET_LENGTH(unicode);
+ Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
if (length == 0) {
PyObject *empty = unicode_get_empty();
if (unicode != empty) {
@@ -683,23 +673,10 @@ unicode_result_ready(PyObject *unicode)
}
static PyObject*
-unicode_result(PyObject *unicode)
-{
- assert(_PyUnicode_CHECK(unicode));
- if (PyUnicode_IS_READY(unicode))
- return unicode_result_ready(unicode);
- else
- return unicode_result_wchar(unicode);
-}
-
-static PyObject*
unicode_result_unchanged(PyObject *unicode)
{
if (PyUnicode_CheckExact(unicode)) {
- if (PyUnicode_READY(unicode) == -1)
- return NULL;
- Py_INCREF(unicode);
- return unicode;
+ return Py_NewRef(unicode);
}
else
/* Subtype -- return genuine unicode string with the same value. */
@@ -714,10 +691,9 @@ backslashreplace(_PyBytesWriter *writer, char *str,
{
Py_ssize_t size, i;
Py_UCS4 ch;
- enum PyUnicode_Kind kind;
+ int kind;
const void *data;
- assert(PyUnicode_IS_READY(unicode));
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
@@ -781,10 +757,9 @@ xmlcharrefreplace(_PyBytesWriter *writer, char *str,
{
Py_ssize_t size, i;
Py_UCS4 ch;
- enum PyUnicode_Kind kind;
+ int kind;
const void *data;
- assert(PyUnicode_IS_READY(unicode));
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
@@ -908,7 +883,7 @@ ensure_unicode(PyObject *obj)
Py_TYPE(obj)->tp_name);
return -1;
}
- return PyUnicode_READY(obj);
+ return 0;
}
/* Compilation of templated routines */
@@ -954,15 +929,6 @@ ensure_unicode(PyObject *obj)
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"
-_Py_COMP_DIAG_PUSH
-_Py_COMP_DIAG_IGNORE_DEPR_DECLS
-#include "stringlib/unicodedefs.h"
-#include "stringlib/fastsearch.h"
-#include "stringlib/count.h"
-#include "stringlib/find.h"
-#include "stringlib/undef.h"
-_Py_COMP_DIAG_POP
-
#undef STRINGLIB_GET_EMPTY
/* --- Unicode Object ----------------------------------------------------- */
@@ -1022,14 +988,12 @@ resize_compact(PyObject *unicode, Py_ssize_t length)
Py_ssize_t char_size;
Py_ssize_t struct_size;
Py_ssize_t new_size;
- int share_wstr;
PyObject *new_unicode;
#ifdef Py_DEBUG
Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif
assert(unicode_modifiable(unicode));
- assert(PyUnicode_IS_READY(unicode));
assert(PyUnicode_IS_COMPACT(unicode));
char_size = PyUnicode_KIND(unicode);
@@ -1037,7 +1001,6 @@ resize_compact(PyObject *unicode, Py_ssize_t length)
struct_size = sizeof(PyASCIIObject);
else
struct_size = sizeof(PyCompactUnicodeObject);
- share_wstr = _PyUnicode_SHARE_WSTR(unicode);
if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
PyErr_NoMemory();
@@ -1050,34 +1013,20 @@ resize_compact(PyObject *unicode, Py_ssize_t length)
_PyUnicode_UTF8(unicode) = NULL;
_PyUnicode_UTF8_LENGTH(unicode) = 0;
}
-#ifdef Py_REF_DEBUG
- _Py_RefTotal--;
-#endif
#ifdef Py_TRACE_REFS
_Py_ForgetReference(unicode);
#endif
new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
if (new_unicode == NULL) {
- _Py_NewReference(unicode);
+ _Py_NewReferenceNoTotal(unicode);
PyErr_NoMemory();
return NULL;
}
unicode = new_unicode;
- _Py_NewReference(unicode);
+ _Py_NewReferenceNoTotal(unicode);
_PyUnicode_LENGTH(unicode) = length;
- if (share_wstr) {
- _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
- if (!PyUnicode_IS_ASCII(unicode))
- _PyUnicode_WSTR_LENGTH(unicode) = length;
- }
- else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
- PyObject_Free(_PyUnicode_WSTR(unicode));
- _PyUnicode_WSTR(unicode) = NULL;
- if (!PyUnicode_IS_ASCII(unicode))
- _PyUnicode_WSTR_LENGTH(unicode) = 0;
- }
#ifdef Py_DEBUG
unicode_fill_invalid(unicode, old_length);
#endif
@@ -1090,78 +1039,55 @@ resize_compact(PyObject *unicode, Py_ssize_t length)
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
- wchar_t *wstr;
- Py_ssize_t new_size;
assert(!PyUnicode_IS_COMPACT(unicode));
assert(Py_REFCNT(unicode) == 1);
- if (PyUnicode_IS_READY(unicode)) {
- Py_ssize_t char_size;
- int share_wstr, share_utf8;
- void *data;
+ Py_ssize_t new_size;
+ Py_ssize_t char_size;
+ int share_utf8;
+ void *data;
#ifdef Py_DEBUG
- Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
+ Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif
- data = _PyUnicode_DATA_ANY(unicode);
- char_size = PyUnicode_KIND(unicode);
- share_wstr = _PyUnicode_SHARE_WSTR(unicode);
- share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
+ data = _PyUnicode_DATA_ANY(unicode);
+ char_size = PyUnicode_KIND(unicode);
+ share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
- if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
- PyErr_NoMemory();
- return -1;
- }
- new_size = (length + 1) * char_size;
+ if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ new_size = (length + 1) * char_size;
- if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
- {
- PyObject_Free(_PyUnicode_UTF8(unicode));
- _PyUnicode_UTF8(unicode) = NULL;
- _PyUnicode_UTF8_LENGTH(unicode) = 0;
- }
+ if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
+ {
+ PyObject_Free(_PyUnicode_UTF8(unicode));
+ _PyUnicode_UTF8(unicode) = NULL;
+ _PyUnicode_UTF8_LENGTH(unicode) = 0;
+ }
- data = (PyObject *)PyObject_Realloc(data, new_size);
- if (data == NULL) {
- PyErr_NoMemory();
- return -1;
- }
- _PyUnicode_DATA_ANY(unicode) = data;
- if (share_wstr) {
- _PyUnicode_WSTR(unicode) = data;
- _PyUnicode_WSTR_LENGTH(unicode) = length;
- }
- if (share_utf8) {
- _PyUnicode_UTF8(unicode) = data;
- _PyUnicode_UTF8_LENGTH(unicode) = length;
- }
- _PyUnicode_LENGTH(unicode) = length;
- PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
+ data = (PyObject *)PyObject_Realloc(data, new_size);
+ if (data == NULL) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ _PyUnicode_DATA_ANY(unicode) = data;
+ if (share_utf8) {
+ _PyUnicode_UTF8(unicode) = data;
+ _PyUnicode_UTF8_LENGTH(unicode) = length;
+ }
+ _PyUnicode_LENGTH(unicode) = length;
+ PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
- unicode_fill_invalid(unicode, old_length);
+ unicode_fill_invalid(unicode, old_length);
#endif
- if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
- assert(_PyUnicode_CheckConsistency(unicode, 0));
- return 0;
- }
- }
- assert(_PyUnicode_WSTR(unicode) != NULL);
/* check for integer overflow */
if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
PyErr_NoMemory();
return -1;
}
- new_size = sizeof(wchar_t) * (length + 1);
- wstr = _PyUnicode_WSTR(unicode);
- wstr = PyObject_Realloc(wstr, new_size);
- if (!wstr) {
- PyErr_NoMemory();
- return -1;
- }
- _PyUnicode_WSTR(unicode) = wstr;
- _PyUnicode_WSTR(unicode)[length] = 0;
- _PyUnicode_WSTR_LENGTH(unicode) = length;
assert(_PyUnicode_CheckConsistency(unicode, 0));
return 0;
}
@@ -1170,99 +1096,15 @@ static PyObject*
resize_copy(PyObject *unicode, Py_ssize_t length)
{
Py_ssize_t copy_length;
- if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
- PyObject *copy;
-
- assert(PyUnicode_IS_READY(unicode));
-
- copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
- if (copy == NULL)
- return NULL;
-
- copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
- _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
- return copy;
- }
- else {
- PyObject *w;
-
- w = (PyObject*)_PyUnicode_New(length);
- if (w == NULL)
- return NULL;
- copy_length = _PyUnicode_WSTR_LENGTH(unicode);
- copy_length = Py_MIN(copy_length, length);
- memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
- copy_length * sizeof(wchar_t));
- return w;
- }
-}
-
-/* We allocate one more byte to make sure the string is
- Ux0000 terminated; some code (e.g. new_identifier)
- relies on that.
-
- XXX This allocator could further be enhanced by assuring that the
- free list never reduces its size below 1.
-
-*/
-
-static PyUnicodeObject *
-_PyUnicode_New(Py_ssize_t length)
-{
- PyUnicodeObject *unicode;
- size_t new_size;
-
- /* Optimization for empty strings */
- if (length == 0) {
- return (PyUnicodeObject *)unicode_new_empty();
- }
-
- /* Ensure we won't overflow the size. */
- if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
- return (PyUnicodeObject *)PyErr_NoMemory();
- }
- if (length < 0) {
- PyErr_SetString(PyExc_SystemError,
- "Negative size passed to _PyUnicode_New");
- return NULL;
- }
-
- unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
- if (unicode == NULL)
- return NULL;
- new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
-
- _PyUnicode_WSTR_LENGTH(unicode) = length;
- _PyUnicode_HASH(unicode) = -1;
- _PyUnicode_STATE(unicode).interned = 0;
- _PyUnicode_STATE(unicode).kind = 0;
- _PyUnicode_STATE(unicode).compact = 0;
- _PyUnicode_STATE(unicode).ready = 0;
- _PyUnicode_STATE(unicode).ascii = 0;
- _PyUnicode_DATA_ANY(unicode) = NULL;
- _PyUnicode_LENGTH(unicode) = 0;
- _PyUnicode_UTF8(unicode) = NULL;
- _PyUnicode_UTF8_LENGTH(unicode) = 0;
+ PyObject *copy;
- _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
- if (!_PyUnicode_WSTR(unicode)) {
- Py_DECREF(unicode);
- PyErr_NoMemory();
+ copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
+ if (copy == NULL)
return NULL;
- }
-
- /* Initialize the first element to guard against cases where
- * the caller fails before initializing str -- unicode_resize()
- * reads str[0], and the Keep-Alive optimization can keep memory
- * allocated for str alive across a call to unicode_dealloc(unicode).
- * We don't want unicode_resize to read uninitialized memory in
- * that case.
- */
- _PyUnicode_WSTR(unicode)[0] = 0;
- _PyUnicode_WSTR(unicode)[length] = 0;
- assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
- return unicode;
+ copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
+ _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
+ return copy;
}
static const char*
@@ -1272,8 +1114,6 @@ unicode_kind_name(PyObject *unicode)
_PyUnicode_Dump() */
if (!PyUnicode_IS_COMPACT(unicode))
{
- if (!PyUnicode_IS_READY(unicode))
- return "wstr";
switch (PyUnicode_KIND(unicode))
{
case PyUnicode_1BYTE_KIND:
@@ -1289,7 +1129,6 @@ unicode_kind_name(PyObject *unicode)
return "<legacy invalid kind>";
}
}
- assert(PyUnicode_IS_READY(unicode));
switch (PyUnicode_KIND(unicode)) {
case PyUnicode_1BYTE_KIND:
if (PyUnicode_IS_ASCII(unicode))
@@ -1346,15 +1185,7 @@ _PyUnicode_Dump(PyObject *op)
data = unicode->data.any;
printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
- if (ascii->wstr == data)
- printf("shared ");
- printf("wstr=%p", (void *)ascii->wstr);
-
- if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
- printf(" (%zu), ", compact->wstr_length);
- if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
- printf("shared ");
- }
+ if (!ascii->state.ascii) {
printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
}
printf(", data=%p\n", data);
@@ -1373,13 +1204,12 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
PyObject *obj;
PyCompactUnicodeObject *unicode;
void *data;
- enum PyUnicode_Kind kind;
- int is_sharing, is_ascii;
+ int kind;
+ int is_ascii;
Py_ssize_t char_size;
Py_ssize_t struct_size;
is_ascii = 0;
- is_sharing = 0;
struct_size = sizeof(PyCompactUnicodeObject);
if (maxchar < 128) {
kind = PyUnicode_1BYTE_KIND;
@@ -1394,8 +1224,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
else if (maxchar < 65536) {
kind = PyUnicode_2BYTE_KIND;
char_size = 2;
- if (sizeof(wchar_t) == 2)
- is_sharing = 1;
}
else {
if (maxchar > MAX_UNICODE) {
@@ -1405,8 +1233,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
}
kind = PyUnicode_4BYTE_KIND;
char_size = 4;
- if (sizeof(wchar_t) == 4)
- is_sharing = 1;
}
/* Ensure we won't overflow the size. */
@@ -1438,16 +1264,13 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
_PyUnicode_STATE(unicode).interned = 0;
_PyUnicode_STATE(unicode).kind = kind;
_PyUnicode_STATE(unicode).compact = 1;
- _PyUnicode_STATE(unicode).ready = 1;
_PyUnicode_STATE(unicode).ascii = is_ascii;
+ _PyUnicode_STATE(unicode).statically_allocated = 0;
if (is_ascii) {
((char*)data)[size] = 0;
- _PyUnicode_WSTR(unicode) = NULL;
}
else if (kind == PyUnicode_1BYTE_KIND) {
((char*)data)[size] = 0;
- _PyUnicode_WSTR(unicode) = NULL;
- _PyUnicode_WSTR_LENGTH(unicode) = 0;
unicode->utf8 = NULL;
unicode->utf8_length = 0;
}
@@ -1458,14 +1281,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
((Py_UCS2*)data)[size] = 0;
else /* kind == PyUnicode_4BYTE_KIND */
((Py_UCS4*)data)[size] = 0;
- if (is_sharing) {
- _PyUnicode_WSTR_LENGTH(unicode) = size;
- _PyUnicode_WSTR(unicode) = (wchar_t *)data;
- }
- else {
- _PyUnicode_WSTR_LENGTH(unicode) = 0;
- _PyUnicode_WSTR(unicode) = NULL;
- }
}
#ifdef Py_DEBUG
unicode_fill_invalid((PyObject*)unicode, 0);
@@ -1530,7 +1345,7 @@ _copy_characters(PyObject *to, Py_ssize_t to_start,
PyObject *from, Py_ssize_t from_start,
Py_ssize_t how_many, int check_maxchar)
{
- unsigned int from_kind, to_kind;
+ int from_kind, to_kind;
const void *from_data;
void *to_data;
@@ -1538,11 +1353,9 @@ _copy_characters(PyObject *to, Py_ssize_t to_start,
assert(0 <= from_start);
assert(0 <= to_start);
assert(PyUnicode_Check(from));
- assert(PyUnicode_IS_READY(from));
assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
assert(PyUnicode_Check(to));
- assert(PyUnicode_IS_READY(to));
assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
if (how_many == 0)
@@ -1687,11 +1500,6 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
return -1;
}
- if (PyUnicode_READY(from) == -1)
- return -1;
- if (PyUnicode_READY(to) == -1)
- return -1;
-
if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
PyErr_SetString(PyExc_IndexError, "string index out of range");
return -1;
@@ -1776,135 +1584,6 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
return 0;
}
-int
-_PyUnicode_Ready(PyObject *unicode)
-{
- wchar_t *end;
- Py_UCS4 maxchar = 0;
- Py_ssize_t num_surrogates;
-#if SIZEOF_WCHAR_T == 2
- Py_ssize_t length_wo_surrogates;
-#endif
-
- /* _PyUnicode_Ready() is only intended for old-style API usage where
- strings were created using _PyObject_New() and where no canonical
- representation (the str field) has been set yet aka strings
- which are not yet ready. */
- assert(_PyUnicode_CHECK(unicode));
- assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
- assert(_PyUnicode_WSTR(unicode) != NULL);
- assert(_PyUnicode_DATA_ANY(unicode) == NULL);
- assert(_PyUnicode_UTF8(unicode) == NULL);
- /* Actually, it should neither be interned nor be anything else: */
- assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
-
- end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
- if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
- &maxchar, &num_surrogates) == -1)
- return -1;
-
- if (maxchar < 256) {
- _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
- if (!_PyUnicode_DATA_ANY(unicode)) {
- PyErr_NoMemory();
- return -1;
- }
- _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
- _PyUnicode_WSTR(unicode), end,
- PyUnicode_1BYTE_DATA(unicode));
- PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
- _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
- _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
- if (maxchar < 128) {
- _PyUnicode_STATE(unicode).ascii = 1;
- _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
- _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
- }
- else {
- _PyUnicode_STATE(unicode).ascii = 0;
- _PyUnicode_UTF8(unicode) = NULL;
- _PyUnicode_UTF8_LENGTH(unicode) = 0;
- }
- PyObject_Free(_PyUnicode_WSTR(unicode));
- _PyUnicode_WSTR(unicode) = NULL;
- _PyUnicode_WSTR_LENGTH(unicode) = 0;
- }
- /* In this case we might have to convert down from 4-byte native
- wchar_t to 2-byte unicode. */
- else if (maxchar < 65536) {
- assert(num_surrogates == 0 &&
- "FindMaxCharAndNumSurrogatePairs() messed up");
-
-#if SIZEOF_WCHAR_T == 2
- /* We can share representations and are done. */
- _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
- PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
- _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
- _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
- _PyUnicode_UTF8(unicode) = NULL;
- _PyUnicode_UTF8_LENGTH(unicode) = 0;
-#else
- /* sizeof(wchar_t) == 4 */
- _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
- 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
- if (!_PyUnicode_DATA_ANY(unicode)) {
- PyErr_NoMemory();
- return -1;
- }
- _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
- _PyUnicode_WSTR(unicode), end,
- PyUnicode_2BYTE_DATA(unicode));
- PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
- _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
- _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
- _PyUnicode_UTF8(unicode) = NULL;
- _PyUnicode_UTF8_LENGTH(unicode) = 0;
- PyObject_Free(_PyUnicode_WSTR(unicode));
- _PyUnicode_WSTR(unicode) = NULL;
- _PyUnicode_WSTR_LENGTH(unicode) = 0;
-#endif
- }
- /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
- else {
-#if SIZEOF_WCHAR_T == 2
- /* in case the native representation is 2-bytes, we need to allocate a
- new normalized 4-byte version. */
- length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
- if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
- PyErr_NoMemory();
- return -1;
- }
- _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
- if (!_PyUnicode_DATA_ANY(unicode)) {
- PyErr_NoMemory();
- return -1;
- }
- _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
- _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
- _PyUnicode_UTF8(unicode) = NULL;
- _PyUnicode_UTF8_LENGTH(unicode) = 0;
- /* unicode_convert_wchar_to_ucs4() requires a ready string */
- _PyUnicode_STATE(unicode).ready = 1;
- unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
- PyObject_Free(_PyUnicode_WSTR(unicode));
- _PyUnicode_WSTR(unicode) = NULL;
- _PyUnicode_WSTR_LENGTH(unicode) = 0;
-#else
- assert(num_surrogates == 0);
-
- _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
- _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
- _PyUnicode_UTF8(unicode) = NULL;
- _PyUnicode_UTF8_LENGTH(unicode) = 0;
- _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
-#endif
- PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
- }
- _PyUnicode_STATE(unicode).ready = 1;
- assert(_PyUnicode_CheckConsistency(unicode, 1));
- return 0;
-}
-
static void
unicode_dealloc(PyObject *unicode)
{
@@ -1913,38 +1592,15 @@ unicode_dealloc(PyObject *unicode)
_Py_FatalRefcountError("deallocating an Unicode singleton");
}
#endif
-
- switch (PyUnicode_CHECK_INTERNED(unicode)) {
- case SSTATE_NOT_INTERNED:
- break;
- case SSTATE_INTERNED_MORTAL:
+ /* This should never get called, but we also don't want to SEGV if
+ * we accidentally decref an immortal string out of existence. Since
+ * the string is an immortal object, just re-set the reference count.
+ */
+ if (PyUnicode_CHECK_INTERNED(unicode)
+ || _PyUnicode_STATE(unicode).statically_allocated)
{
- /* Revive the dead object temporarily. PyDict_DelItem() removes two
- references (key and value) which were ignored by
- PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
- to prevent calling unicode_dealloc() again. Adjust refcnt after
- PyDict_DelItem(). */
- assert(Py_REFCNT(unicode) == 0);
- Py_SET_REFCNT(unicode, 3);
- if (PyDict_DelItem(interned, unicode) != 0) {
- _PyErr_WriteUnraisableMsg("deletion of interned string failed",
- NULL);
- }
- assert(Py_REFCNT(unicode) == 1);
- Py_SET_REFCNT(unicode, 0);
- break;
- }
-
- case SSTATE_INTERNED_IMMORTAL:
- _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
- break;
-
- default:
- Py_UNREACHABLE();
- }
-
- if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
- PyObject_Free(_PyUnicode_WSTR(unicode));
+ _Py_SetImmortal(unicode);
+ return;
}
if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
PyObject_Free(_PyUnicode_UTF8(unicode));
@@ -1965,7 +1621,7 @@ unicode_is_singleton(PyObject *unicode)
}
PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
- if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) {
+ if (ascii->length == 1) {
Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
if (ch < 256 && LATIN1(ch) == unicode) {
return 1;
@@ -2007,10 +1663,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
assert(PyUnicode_Check(unicode));
assert(0 <= length);
- if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
- old_length = PyUnicode_WSTR_LENGTH(unicode);
- else
- old_length = PyUnicode_GET_LENGTH(unicode);
+ old_length = PyUnicode_GET_LENGTH(unicode);
if (old_length == length)
return 0;
@@ -2064,7 +1717,7 @@ static void
unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
const char *str, Py_ssize_t len)
{
- enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
+ int kind = PyUnicode_KIND(unicode);
const void *data = PyUnicode_DATA(unicode);
const char *end = str + len;
@@ -2140,28 +1793,6 @@ unicode_char(Py_UCS4 ch)
}
PyObject *
-PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
-{
- if (u == NULL) {
- if (size > 0) {
- if (PyErr_WarnEx(PyExc_DeprecationWarning,
- "PyUnicode_FromUnicode(NULL, size) is deprecated; "
- "use PyUnicode_New() instead", 1) < 0) {
- return NULL;
- }
- }
- return (PyObject*)_PyUnicode_New(size);
- }
-
- if (size < 0) {
- PyErr_BadInternalCall();
- return NULL;
- }
-
- return PyUnicode_FromWideChar(u, size);
-}
-
-PyObject *
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
{
PyObject *unicode;
@@ -2254,16 +1885,12 @@ PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
if (u != NULL) {
return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
}
- else {
- if (size > 0) {
- if (PyErr_WarnEx(PyExc_DeprecationWarning,
- "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
- "use PyUnicode_New() instead", 1) < 0) {
- return NULL;
- }
- }
- return (PyObject *)_PyUnicode_New(size);
+ if (size > 0) {
+ PyErr_SetString(PyExc_SystemError,
+ "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
+ return NULL;
}
+ return unicode_new_empty();
}
PyObject *
@@ -2291,7 +1918,7 @@ _PyUnicode_FromId(_Py_Identifier *id)
Py_ssize_t index = _Py_atomic_size_get(&id->index);
if (index < 0) {
- struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
+ struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
// Check again to detect concurrent access. Another thread can have
@@ -2382,7 +2009,7 @@ _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
}
static Py_UCS4
-kind_maxchar_limit(unsigned int kind)
+kind_maxchar_limit(int kind)
{
switch (kind) {
case PyUnicode_1BYTE_KIND:
@@ -2496,10 +2123,9 @@ PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
Py_UCS4
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
{
- enum PyUnicode_Kind kind;
+ int kind;
const void *startptr, *endptr;
- assert(PyUnicode_IS_READY(unicode));
assert(0 <= start);
assert(end <= PyUnicode_GET_LENGTH(unicode));
assert(start <= end);
@@ -2538,11 +2164,10 @@ unicode_adjust_maxchar(PyObject **p_unicode)
PyObject *unicode, *copy;
Py_UCS4 max_char;
Py_ssize_t len;
- unsigned int kind;
+ int kind;
assert(p_unicode != NULL);
unicode = *p_unicode;
- assert(PyUnicode_IS_READY(unicode));
if (PyUnicode_IS_ASCII(unicode))
return;
@@ -2586,8 +2211,6 @@ _PyUnicode_Copy(PyObject *unicode)
PyErr_BadInternalCall();
return NULL;
}
- if (PyUnicode_READY(unicode) == -1)
- return NULL;
length = PyUnicode_GET_LENGTH(unicode);
copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
@@ -2606,7 +2229,7 @@ _PyUnicode_Copy(PyObject *unicode)
character. Return NULL on error. */
static void*
-unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
+unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
{
void *result;
@@ -2656,8 +2279,6 @@ as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
int kind;
const void *data;
Py_ssize_t len, targetlen;
- if (PyUnicode_READY(string) == -1)
- return NULL;
kind = PyUnicode_KIND(string);
data = PyUnicode_DATA(string);
len = PyUnicode_GET_LENGTH(string);
@@ -2716,21 +2337,19 @@ PyUnicode_AsUCS4Copy(PyObject *string)
return as_ucs4(string, NULL, 0, 1);
}
-/* maximum number of characters required for output of %lld or %p.
- We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
- plus 1 for the sign. 53/22 is an upper bound for log10(256). */
-#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
+/* maximum number of characters required for output of %jo or %jd or %p.
+ We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
+ plus 1 for the sign, plus 2 for the 0x prefix (for %p),
+ plus 1 for the terminal NUL. */
+#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
static int
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
- Py_ssize_t width, Py_ssize_t precision)
+ Py_ssize_t width, Py_ssize_t precision, int flags)
{
Py_ssize_t length, fill, arglen;
Py_UCS4 maxchar;
- if (PyUnicode_READY(str) == -1)
- return -1;
-
length = PyUnicode_GET_LENGTH(str);
if ((precision == -1 || precision >= length)
&& width <= length)
@@ -2748,8 +2367,8 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
return -1;
- if (width > length) {
- fill = width - length;
+ fill = Py_MAX(width - length, 0);
+ if (fill && !(flags & F_LJUST)) {
if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
return -1;
writer->pos += fill;
@@ -2758,12 +2377,19 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
str, 0, length);
writer->pos += length;
+
+ if (fill && (flags & F_LJUST)) {
+ if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
+ return -1;
+ writer->pos += fill;
+ }
+
return 0;
}
static int
unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
- Py_ssize_t width, Py_ssize_t precision)
+ Py_ssize_t width, Py_ssize_t precision, int flags)
{
/* UTF-8 */
Py_ssize_t length;
@@ -2783,36 +2409,93 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
if (unicode == NULL)
return -1;
- res = unicode_fromformat_write_str(writer, unicode, width, -1);
+ res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
Py_DECREF(unicode);
return res;
}
+static int
+unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
+ Py_ssize_t width, Py_ssize_t precision, int flags)
+{
+ /* UTF-8 */
+ Py_ssize_t length;
+ PyObject *unicode;
+ int res;
+
+ if (precision == -1) {
+ length = wcslen(str);
+ }
+ else {
+ length = 0;
+ while (length < precision && str[length]) {
+ length++;
+ }
+ }
+ unicode = PyUnicode_FromWideChar(str, length);
+ if (unicode == NULL)
+ return -1;
+
+ res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
+ Py_DECREF(unicode);
+ return res;
+}
+
+#define F_LONG 1
+#define F_LONGLONG 2
+#define F_SIZE 3
+#define F_PTRDIFF 4
+#define F_INTMAX 5
+static const char * const formats[] = {"%d", "%ld", "%lld", "%zd", "%td", "%jd"};
+static const char * const formats_o[] = {"%o", "%lo", "%llo", "%zo", "%to", "%jo"};
+static const char * const formats_u[] = {"%u", "%lu", "%llu", "%zu", "%tu", "%ju"};
+static const char * const formats_x[] = {"%x", "%lx", "%llx", "%zx", "%tx", "%jx"};
+static const char * const formats_X[] = {"%X", "%lX", "%llX", "%zX", "%tX", "%jX"};
+
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
const char *f, va_list *vargs)
{
const char *p;
Py_ssize_t len;
- int zeropad;
+ int flags = 0;
Py_ssize_t width;
Py_ssize_t precision;
- int longflag;
- int longlongflag;
- int size_tflag;
- Py_ssize_t fill;
p = f;
f++;
- zeropad = 0;
- if (*f == '0') {
- zeropad = 1;
+ if (*f == '%') {
+ if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
+ return NULL;
f++;
+ return f;
+ }
+
+ /* Parse flags. Example: "%-i" => flags=F_LJUST. */
+ /* Flags '+', ' ' and '#' are not particularly useful.
+ * They are not worth the implementation and maintenance costs.
+ * In addition, '#' should add "0" for "o" conversions for compatibility
+ * with printf, but it would confuse Python users. */
+ while (1) {
+ switch (*f++) {
+ case '-': flags |= F_LJUST; continue;
+ case '0': flags |= F_ZERO; continue;
+ }
+ f--;
+ break;
}
/* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
width = -1;
- if (Py_ISDIGIT((unsigned)*f)) {
+ if (*f == '*') {
+ width = va_arg(*vargs, int);
+ if (width < 0) {
+ flags |= F_LJUST;
+ width = -width;
+ }
+ f++;
+ }
+ else if (Py_ISDIGIT((unsigned)*f)) {
width = *f - '0';
f++;
while (Py_ISDIGIT((unsigned)*f)) {
@@ -2828,7 +2511,14 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
precision = -1;
if (*f == '.') {
f++;
- if (Py_ISDIGIT((unsigned)*f)) {
+ if (*f == '*') {
+ precision = va_arg(*vargs, int);
+ if (precision < 0) {
+ precision = -2;
+ }
+ f++;
+ }
+ else if (Py_ISDIGIT((unsigned)*f)) {
precision = (*f - '0');
f++;
while (Py_ISDIGIT((unsigned)*f)) {
@@ -2841,41 +2531,50 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
f++;
}
}
- if (*f == '%') {
- /* "%.3%s" => f points to "3" */
- f--;
- }
- }
- if (*f == '\0') {
- /* bogus format "%.123" => go backward, f points to "3" */
- f--;
}
- /* Handle %ld, %lu, %lld and %llu. */
- longflag = 0;
- longlongflag = 0;
- size_tflag = 0;
+ int sizemod = 0;
if (*f == 'l') {
- if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
- longflag = 1;
- ++f;
- }
- else if (f[1] == 'l' &&
- (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
- longlongflag = 1;
+ if (f[1] == 'l') {
+ sizemod = F_LONGLONG;
f += 2;
}
+ else {
+ sizemod = F_LONG;
+ ++f;
+ }
}
- /* handle the size_t flag. */
- else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
- size_tflag = 1;
+ else if (*f == 'z') {
+ sizemod = F_SIZE;
++f;
}
-
- if (f[1] == '\0')
+ else if (*f == 't') {
+ sizemod = F_PTRDIFF;
+ ++f;
+ }
+ else if (*f == 'j') {
+ sizemod = F_INTMAX;
+ ++f;
+ }
+ if (f[0] != '\0' && f[1] == '\0')
writer->overallocate = 0;
switch (*f) {
+ case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
+ break;
+ case 'c': case 'p':
+ if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
+ break;
+ case 's':
+ case 'V':
+ if (sizemod && sizemod != F_LONG) goto invalid_format;
+ break;
+ default:
+ if (sizemod) goto invalid_format;
+ break;
+ }
+
+ switch (*f) {
case 'c':
{
int ordinal = va_arg(*vargs, int);
@@ -2889,78 +2588,98 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
break;
}
- case 'i':
- case 'd':
- case 'u':
- case 'x':
+ case 'd': case 'i':
+ case 'o': case 'u': case 'x': case 'X':
{
/* used by sprintf */
- char buffer[MAX_LONG_LONG_CHARS];
- Py_ssize_t arglen;
-
- if (*f == 'u') {
- if (longflag) {
- len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
- }
- else if (longlongflag) {
- len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
- }
- else if (size_tflag) {
- len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
- }
- else {
- len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
- }
+ char buffer[MAX_INTMAX_CHARS];
+ const char *fmt = NULL;
+ switch (*f) {
+ case 'o': fmt = formats_o[sizemod]; break;
+ case 'u': fmt = formats_u[sizemod]; break;
+ case 'x': fmt = formats_x[sizemod]; break;
+ case 'X': fmt = formats_X[sizemod]; break;
+ default: fmt = formats[sizemod]; break;
}
- else if (*f == 'x') {
- len = sprintf(buffer, "%x", va_arg(*vargs, int));
- }
- else {
- if (longflag) {
- len = sprintf(buffer, "%li", va_arg(*vargs, long));
- }
- else if (longlongflag) {
- len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
- }
- else if (size_tflag) {
- len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
- }
- else {
- len = sprintf(buffer, "%i", va_arg(*vargs, int));
- }
+ int issigned = (*f == 'd' || *f == 'i');
+ switch (sizemod) {
+ case F_LONG:
+ len = issigned ?
+ sprintf(buffer, fmt, va_arg(*vargs, long)) :
+ sprintf(buffer, fmt, va_arg(*vargs, unsigned long));
+ break;
+ case F_LONGLONG:
+ len = issigned ?
+ sprintf(buffer, fmt, va_arg(*vargs, long long)) :
+ sprintf(buffer, fmt, va_arg(*vargs, unsigned long long));
+ break;
+ case F_SIZE:
+ len = issigned ?
+ sprintf(buffer, fmt, va_arg(*vargs, Py_ssize_t)) :
+ sprintf(buffer, fmt, va_arg(*vargs, size_t));
+ break;
+ case F_PTRDIFF:
+ len = sprintf(buffer, fmt, va_arg(*vargs, ptrdiff_t));
+ break;
+ case F_INTMAX:
+ len = issigned ?
+ sprintf(buffer, fmt, va_arg(*vargs, intmax_t)) :
+ sprintf(buffer, fmt, va_arg(*vargs, uintmax_t));
+ break;
+ default:
+ len = issigned ?
+ sprintf(buffer, fmt, va_arg(*vargs, int)) :
+ sprintf(buffer, fmt, va_arg(*vargs, unsigned int));
+ break;
}
assert(len >= 0);
- if (precision < len)
- precision = len;
+ int sign = (buffer[0] == '-');
+ len -= sign;
+
+ precision = Py_MAX(precision, len);
+ width = Py_MAX(width, precision + sign);
+ if ((flags & F_ZERO) && !(flags & F_LJUST)) {
+ precision = width - sign;
+ }
+
+ Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
+ Py_ssize_t zeropad = Py_MAX(precision - len, 0);
- arglen = Py_MAX(precision, width);
- if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
+ if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
return NULL;
- if (width > precision) {
- Py_UCS4 fillchar;
- fill = width - precision;
- fillchar = zeropad?'0':' ';
- if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
+ if (spacepad && !(flags & F_LJUST)) {
+ if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
return NULL;
- writer->pos += fill;
+ writer->pos += spacepad;
}
- if (precision > len) {
- fill = precision - len;
- if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
+
+ if (sign) {
+ if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
+ return NULL;
+ }
+
+ if (zeropad) {
+ if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
return NULL;
- writer->pos += fill;
+ writer->pos += zeropad;
}
- if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
+ if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
return NULL;
+
+ if (spacepad && (flags & F_LJUST)) {
+ if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
+ return NULL;
+ writer->pos += spacepad;
+ }
break;
}
case 'p':
{
- char number[MAX_LONG_LONG_CHARS];
+ char number[MAX_INTMAX_CHARS];
len = sprintf(number, "%p", va_arg(*vargs, void*));
assert(len >= 0);
@@ -2983,10 +2702,17 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
case 's':
{
- /* UTF-8 */
- const char *s = va_arg(*vargs, const char*);
- if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
- return NULL;
+ if (sizemod) {
+ const wchar_t *s = va_arg(*vargs, const wchar_t*);
+ if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
+ return NULL;
+ }
+ else {
+ /* UTF-8 */
+ const char *s = va_arg(*vargs, const char*);
+ if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0)
+ return NULL;
+ }
break;
}
@@ -2995,7 +2721,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
PyObject *obj = va_arg(*vargs, PyObject *);
assert(obj && _PyUnicode_CHECK(obj));
- if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
+ if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
return NULL;
break;
}
@@ -3003,15 +2729,27 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
case 'V':
{
PyObject *obj = va_arg(*vargs, PyObject *);
- const char *str = va_arg(*vargs, const char *);
+ const char *str;
+ const wchar_t *wstr;
+ if (sizemod) {
+ wstr = va_arg(*vargs, const wchar_t*);
+ }
+ else {
+ str = va_arg(*vargs, const char *);
+ }
if (obj) {
assert(_PyUnicode_CHECK(obj));
- if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
+ if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
+ return NULL;
+ }
+ else if (sizemod) {
+ assert(wstr != NULL);
+ if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
return NULL;
}
else {
assert(str != NULL);
- if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
+ if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0)
return NULL;
}
break;
@@ -3025,7 +2763,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
str = PyObject_Str(obj);
if (!str)
return NULL;
- if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
+ if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
Py_DECREF(str);
return NULL;
}
@@ -3041,7 +2779,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
repr = PyObject_Repr(obj);
if (!repr)
return NULL;
- if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
+ if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
Py_DECREF(repr);
return NULL;
}
@@ -3057,7 +2795,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
ascii = PyObject_ASCII(obj);
if (!ascii)
return NULL;
- if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
+ if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
Py_DECREF(ascii);
return NULL;
}
@@ -3065,21 +2803,10 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
break;
}
- case '%':
- if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
- return NULL;
- break;
-
default:
- /* if we stumble upon an unknown formatting code, copy the rest
- of the format string to the output string. (we cannot just
- skip the code, since there's no way to know what's in the
- argument list) */
- len = strlen(p);
- if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
- return NULL;
- f = p+len;
- return f;
+ invalid_format:
+ PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
+ return NULL;
}
f++;
@@ -3149,11 +2876,7 @@ PyUnicode_FromFormat(const char *format, ...)
PyObject* ret;
va_list vargs;
-#ifdef HAVE_STDARG_PROTOTYPES
va_start(vargs, format);
-#else
- va_start(vargs);
-#endif
ret = PyUnicode_FromFormatV(format, vargs);
va_end(vargs);
return ret;
@@ -3167,13 +2890,6 @@ unicode_get_widechar_size(PyObject *unicode)
assert(unicode != NULL);
assert(_PyUnicode_CHECK(unicode));
-#if USE_UNICODE_WCHAR_CACHE
- if (_PyUnicode_WSTR(unicode) != NULL) {
- return PyUnicode_WSTR_LENGTH(unicode);
- }
-#endif /* USE_UNICODE_WCHAR_CACHE */
- assert(PyUnicode_IS_READY(unicode));
-
res = _PyUnicode_LENGTH(unicode);
#if SIZEOF_WCHAR_T == 2
if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
@@ -3195,19 +2911,10 @@ unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
assert(unicode != NULL);
assert(_PyUnicode_CHECK(unicode));
-#if USE_UNICODE_WCHAR_CACHE
- const wchar_t *wstr = _PyUnicode_WSTR(unicode);
- if (wstr != NULL) {
- memcpy(w, wstr, size * sizeof(wchar_t));
- return;
- }
-#else /* USE_UNICODE_WCHAR_CACHE */
if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
return;
}
-#endif /* USE_UNICODE_WCHAR_CACHE */
- assert(PyUnicode_IS_READY(unicode));
if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
@@ -3312,7 +3019,7 @@ PyUnicode_AsWideCharString(PyObject *unicode,
}
buflen = unicode_get_widechar_size(unicode);
- buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
+ buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
if (buffer == NULL) {
PyErr_NoMemory();
return NULL;
@@ -3348,26 +3055,16 @@ _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
{
wchar_t **p = (wchar_t **)ptr;
if (obj == NULL) {
-#if !USE_UNICODE_WCHAR_CACHE
PyMem_Free(*p);
-#endif /* USE_UNICODE_WCHAR_CACHE */
*p = NULL;
return 1;
}
if (PyUnicode_Check(obj)) {
-#if USE_UNICODE_WCHAR_CACHE
- *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
- if (*p == NULL) {
- return 0;
- }
- return 1;
-#else /* USE_UNICODE_WCHAR_CACHE */
*p = PyUnicode_AsWideCharString(obj, NULL);
if (*p == NULL) {
return 0;
}
return Py_CLEANUP_SUPPORTED;
-#endif /* USE_UNICODE_WCHAR_CACHE */
}
PyErr_Format(PyExc_TypeError,
"argument must be str, not %.50s",
@@ -3380,9 +3077,7 @@ _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
{
wchar_t **p = (wchar_t **)ptr;
if (obj == NULL) {
-#if !USE_UNICODE_WCHAR_CACHE
PyMem_Free(*p);
-#endif /* USE_UNICODE_WCHAR_CACHE */
*p = NULL;
return 1;
}
@@ -3391,19 +3086,11 @@ _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
return 1;
}
if (PyUnicode_Check(obj)) {
-#if USE_UNICODE_WCHAR_CACHE
- *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
- if (*p == NULL) {
- return 0;
- }
- return 1;
-#else /* USE_UNICODE_WCHAR_CACHE */
*p = PyUnicode_AsWideCharString(obj, NULL);
if (*p == NULL) {
return 0;
}
return Py_CLEANUP_SUPPORTED;
-#endif /* USE_UNICODE_WCHAR_CACHE */
}
PyErr_Format(PyExc_TypeError,
"argument must be str or None, not %.50s",
@@ -3429,10 +3116,7 @@ PyUnicode_FromObject(PyObject *obj)
/* XXX Perhaps we should make this API an alias of
PyObject_Str() instead ?! */
if (PyUnicode_CheckExact(obj)) {
- if (PyUnicode_READY(obj) == -1)
- return NULL;
- Py_INCREF(obj);
- return obj;
+ return Py_NewRef(obj);
}
if (PyUnicode_Check(obj)) {
/* For a Unicode subtype that's not a Unicode object,
@@ -4103,48 +3787,25 @@ PyUnicode_FSConverter(PyObject* arg, void* addr)
int
PyUnicode_FSDecoder(PyObject* arg, void* addr)
{
- int is_buffer = 0;
- PyObject *path = NULL;
- PyObject *output = NULL;
if (arg == NULL) {
Py_DECREF(*(PyObject**)addr);
*(PyObject**)addr = NULL;
return 1;
}
- is_buffer = PyObject_CheckBuffer(arg);
- if (!is_buffer) {
- path = PyOS_FSPath(arg);
- if (path == NULL) {
- return 0;
- }
- }
- else {
- path = arg;
- Py_INCREF(arg);
+ PyObject *path = PyOS_FSPath(arg);
+ if (path == NULL) {
+ return 0;
}
+ PyObject *output = NULL;
if (PyUnicode_Check(path)) {
output = path;
}
- else if (PyBytes_Check(path) || is_buffer) {
- PyObject *path_bytes = NULL;
-
- if (!PyBytes_Check(path) &&
- PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
- "path should be string, bytes, or os.PathLike, not %.200s",
- Py_TYPE(arg)->tp_name)) {
- Py_DECREF(path);
- return 0;
- }
- path_bytes = PyBytes_FromObject(path);
+ else if (PyBytes_Check(path)) {
+ output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
+ PyBytes_GET_SIZE(path));
Py_DECREF(path);
- if (!path_bytes) {
- return 0;
- }
- output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
- PyBytes_GET_SIZE(path_bytes));
- Py_DECREF(path_bytes);
if (!output) {
return 0;
}
@@ -4156,10 +3817,7 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
Py_DECREF(path);
return 0;
}
- if (PyUnicode_READY(output) == -1) {
- Py_DECREF(output);
- return 0;
- }
+
if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
PyErr_SetString(PyExc_ValueError, "embedded null character");
@@ -4180,8 +3838,6 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
PyErr_BadArgument();
return NULL;
}
- if (PyUnicode_READY(unicode) == -1)
- return NULL;
if (PyUnicode_UTF8(unicode) == NULL) {
if (unicode_fill_utf8(unicode) == -1) {
@@ -4200,85 +3856,22 @@ PyUnicode_AsUTF8(PyObject *unicode)
return PyUnicode_AsUTF8AndSize(unicode, NULL);
}
-Py_UNICODE *
-PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
-{
- if (!PyUnicode_Check(unicode)) {
- PyErr_BadArgument();
- return NULL;
- }
- Py_UNICODE *w = _PyUnicode_WSTR(unicode);
- if (w == NULL) {
- /* Non-ASCII compact unicode object */
- assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
- assert(PyUnicode_IS_READY(unicode));
-
- Py_ssize_t wlen = unicode_get_widechar_size(unicode);
- if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
- PyErr_NoMemory();
- return NULL;
- }
- w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
- if (w == NULL) {
- PyErr_NoMemory();
- return NULL;
- }
- unicode_copy_as_widechar(unicode, w, wlen + 1);
- _PyUnicode_WSTR(unicode) = w;
- if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
- _PyUnicode_WSTR_LENGTH(unicode) = wlen;
- }
- }
- if (size != NULL)
- *size = PyUnicode_WSTR_LENGTH(unicode);
- return w;
-}
-
-/* Deprecated APIs */
-
-_Py_COMP_DIAG_PUSH
-_Py_COMP_DIAG_IGNORE_DEPR_DECLS
-
-Py_UNICODE *
-PyUnicode_AsUnicode(PyObject *unicode)
-{
- return PyUnicode_AsUnicodeAndSize(unicode, NULL);
-}
-
-const Py_UNICODE *
-_PyUnicode_AsUnicode(PyObject *unicode)
-{
- Py_ssize_t size;
- const Py_UNICODE *wstr;
-
- wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
- if (wstr && wcslen(wstr) != (size_t)size) {
- PyErr_SetString(PyExc_ValueError, "embedded null character");
- return NULL;
- }
- return wstr;
-}
-
+/*
+PyUnicode_GetSize() has been deprecated since Python 3.3
+because it returned length of Py_UNICODE.
-Py_ssize_t
+But this function is part of stable abi, because it don't
+include Py_UNICODE in signature and it was not excluded from
+stable abi in PEP 384.
+*/
+PyAPI_FUNC(Py_ssize_t)
PyUnicode_GetSize(PyObject *unicode)
{
- if (!PyUnicode_Check(unicode)) {
- PyErr_BadArgument();
- goto onError;
- }
- if (_PyUnicode_WSTR(unicode) == NULL) {
- if (PyUnicode_AsUnicode(unicode) == NULL)
- goto onError;
- }
- return PyUnicode_WSTR_LENGTH(unicode);
-
- onError:
+ PyErr_SetString(PyExc_RuntimeError,
+ "PyUnicode_GetSize has been removed.");
return -1;
}
-_Py_COMP_DIAG_POP
-
Py_ssize_t
PyUnicode_GetLength(PyObject *unicode)
{
@@ -4286,8 +3879,6 @@ PyUnicode_GetLength(PyObject *unicode)
PyErr_BadArgument();
return -1;
}
- if (PyUnicode_READY(unicode) == -1)
- return -1;
return PyUnicode_GET_LENGTH(unicode);
}
@@ -4301,9 +3892,6 @@ PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
PyErr_BadArgument();
return (Py_UCS4)-1;
}
- if (PyUnicode_READY(unicode) == -1) {
- return (Py_UCS4)-1;
- }
if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
PyErr_SetString(PyExc_IndexError, "string index out of range");
return (Py_UCS4)-1;
@@ -4320,7 +3908,6 @@ PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
PyErr_BadArgument();
return -1;
}
- assert(PyUnicode_IS_READY(unicode));
if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
PyErr_SetString(PyExc_IndexError, "string index out of range");
return -1;
@@ -4453,19 +4040,10 @@ unicode_decode_call_errorhandler_wchar(
goto onError;
}
-#if USE_UNICODE_WCHAR_CACHE
-_Py_COMP_DIAG_PUSH
-_Py_COMP_DIAG_IGNORE_DEPR_DECLS
- repwlen = PyUnicode_GetSize(repunicode);
- if (repwlen < 0)
- goto onError;
-_Py_COMP_DIAG_POP
-#else /* USE_UNICODE_WCHAR_CACHE */
repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
if (repwlen < 0)
goto onError;
repwlen--;
-#endif /* USE_UNICODE_WCHAR_CACHE */
/* need more space? (at least enough for what we
have+the replacement+the rest of the string (starting
at the new input position), so we won't have to check space
@@ -4915,8 +4493,6 @@ _PyUnicode_EncodeUTF7(PyObject *str,
char * out;
const char * start;
- if (PyUnicode_READY(str) == -1)
- return NULL;
kind = PyUnicode_KIND(str);
data = PyUnicode_DATA(str);
len = PyUnicode_GET_LENGTH(str);
@@ -5548,14 +5124,11 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
return NULL;
}
- if (PyUnicode_READY(unicode) == -1)
- return NULL;
-
if (PyUnicode_UTF8(unicode))
return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
PyUnicode_UTF8_LENGTH(unicode));
- enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
+ int kind = PyUnicode_KIND(unicode);
const void *data = PyUnicode_DATA(unicode);
Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
@@ -5591,7 +5164,7 @@ unicode_fill_utf8(PyObject *unicode)
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
assert(!PyUnicode_IS_ASCII(unicode));
- enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
+ int kind = PyUnicode_KIND(unicode);
const void *data = PyUnicode_DATA(unicode);
Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
@@ -5726,7 +5299,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
if (e - q >= 4) {
- enum PyUnicode_Kind kind = writer.kind;
+ int kind = writer.kind;
void *data = writer.data;
const unsigned char *last = e - 4;
Py_ssize_t pos = writer.pos;
@@ -5811,7 +5384,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
const char *errors,
int byteorder)
{
- enum PyUnicode_Kind kind;
+ int kind;
const void *data;
Py_ssize_t len;
PyObject *v;
@@ -5831,8 +5404,6 @@ _PyUnicode_EncodeUTF32(PyObject *str,
PyErr_BadArgument();
return NULL;
}
- if (PyUnicode_READY(str) == -1)
- return NULL;
kind = PyUnicode_KIND(str);
data = PyUnicode_DATA(str);
len = PyUnicode_GET_LENGTH(str);
@@ -5899,8 +5470,6 @@ _PyUnicode_EncodeUTF32(PyObject *str,
}
else {
assert(PyUnicode_Check(rep));
- if (PyUnicode_READY(rep) < 0)
- goto error;
moreunits = repsize = PyUnicode_GET_LENGTH(rep);
if (!PyUnicode_IS_ASCII(rep)) {
raise_encode_exception(&exc, encoding,
@@ -6132,7 +5701,7 @@ _PyUnicode_EncodeUTF16(PyObject *str,
const char *errors,
int byteorder)
{
- enum PyUnicode_Kind kind;
+ int kind;
const void *data;
Py_ssize_t len;
PyObject *v;
@@ -6153,8 +5722,6 @@ _PyUnicode_EncodeUTF16(PyObject *str,
PyErr_BadArgument();
return NULL;
}
- if (PyUnicode_READY(str) == -1)
- return NULL;
kind = PyUnicode_KIND(str);
data = PyUnicode_DATA(str);
len = PyUnicode_GET_LENGTH(str);
@@ -6238,8 +5805,6 @@ _PyUnicode_EncodeUTF16(PyObject *str,
}
else {
assert(PyUnicode_Check(rep));
- if (PyUnicode_READY(rep) < 0)
- goto error;
moreunits = repsize = PyUnicode_GET_LENGTH(rep);
if (!PyUnicode_IS_ASCII(rep)) {
raise_encode_exception(&exc, encoding,
@@ -6303,8 +5868,6 @@ PyUnicode_AsUTF16String(PyObject *unicode)
/* --- Unicode Escape Codec ----------------------------------------------- */
-static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
-
PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
Py_ssize_t size,
@@ -6317,6 +5880,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
const char *end;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
+ _PyUnicode_Name_CAPI *ucnhash_capi;
+ PyInterpreterState *interp = _PyInterpreterState_Get();
// so we can remember if we've seen an invalid escape char or not
*first_invalid_escape = NULL;
@@ -6464,6 +6029,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
/* \N{name} */
case 'N':
+ ucnhash_capi = interp->unicode.ucnhash_capi;
if (ucnhash_capi == NULL) {
/* load the unicode data module */
ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
@@ -6475,6 +6041,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
);
goto onError;
}
+ interp->unicode.ucnhash_capi = ucnhash_capi;
}
message = "malformed \\N character escape";
@@ -6601,7 +6168,7 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Py_ssize_t i, len;
PyObject *repr;
char *p;
- enum PyUnicode_Kind kind;
+ int kind;
const void *data;
Py_ssize_t expandsize;
@@ -6617,9 +6184,6 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
PyErr_BadArgument();
return NULL;
}
- if (PyUnicode_READY(unicode) == -1) {
- return NULL;
- }
len = PyUnicode_GET_LENGTH(unicode);
if (len == 0) {
@@ -6874,9 +6438,6 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
PyErr_BadArgument();
return NULL;
}
- if (PyUnicode_READY(unicode) == -1) {
- return NULL;
- }
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
len = PyUnicode_GET_LENGTH(unicode);
@@ -7013,8 +6574,6 @@ unicode_encode_call_errorhandler(const char *errors,
return NULL;
}
- if (PyUnicode_READY(unicode) == -1)
- return NULL;
len = PyUnicode_GET_LENGTH(unicode);
make_encode_exception(exceptionObject,
@@ -7072,8 +6631,6 @@ unicode_encode_ucs1(PyObject *unicode,
/* output object */
_PyBytesWriter writer;
- if (PyUnicode_READY(unicode) == -1)
- return NULL;
size = PyUnicode_GET_LENGTH(unicode);
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
@@ -7192,9 +6749,6 @@ unicode_encode_ucs1(PyObject *unicode,
else {
assert(PyUnicode_Check(rep));
- if (PyUnicode_READY(rep) < 0)
- goto onError;
-
if (limit == 256 ?
PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
!PyUnicode_IS_ASCII(rep))
@@ -7241,8 +6795,6 @@ _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
PyErr_BadArgument();
return NULL;
}
- if (PyUnicode_READY(unicode) == -1)
- return NULL;
/* Fast path: if it is a one-byte string, construct
bytes object directly. */
if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
@@ -7367,8 +6919,6 @@ _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
PyErr_BadArgument();
return NULL;
}
- if (PyUnicode_READY(unicode) == -1)
- return NULL;
/* Fast path: if it is an ASCII-only string, construct bytes object
directly. Else defer to above function to raise the exception. */
if (PyUnicode_IS_ASCII(unicode))
@@ -7756,22 +7306,11 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
substring = PyUnicode_Substring(unicode, offset, offset+len);
if (substring == NULL)
return -1;
-#if USE_UNICODE_WCHAR_CACHE
-_Py_COMP_DIAG_PUSH
-_Py_COMP_DIAG_IGNORE_DEPR_DECLS
- p = PyUnicode_AsUnicodeAndSize(substring, &size);
- if (p == NULL) {
- Py_DECREF(substring);
- return -1;
- }
-_Py_COMP_DIAG_POP
-#else /* USE_UNICODE_WCHAR_CACHE */
p = PyUnicode_AsWideCharString(substring, &size);
Py_CLEAR(substring);
if (p == NULL) {
return -1;
}
-#endif /* USE_UNICODE_WCHAR_CACHE */
assert(size <= INT_MAX);
/* First get the size of the result */
@@ -7822,11 +7361,7 @@ _Py_COMP_DIAG_POP
ret = 0;
done:
-#if USE_UNICODE_WCHAR_CACHE
- Py_DECREF(substring);
-#else /* USE_UNICODE_WCHAR_CACHE */
PyMem_Free(p);
-#endif /* USE_UNICODE_WCHAR_CACHE */
return ret;
error:
@@ -7976,14 +7511,9 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
}
else {
Py_ssize_t i;
- enum PyUnicode_Kind kind;
+ int kind;
const void *data;
- if (PyUnicode_READY(rep) == -1) {
- Py_DECREF(rep);
- goto error;
- }
-
outsize = PyUnicode_GET_LENGTH(rep);
morebytes += outsize;
if (morebytes > 0) {
@@ -8044,8 +7574,6 @@ encode_code_page(int code_page,
return NULL;
}
- if (PyUnicode_READY(unicode) == -1)
- return NULL;
len = PyUnicode_GET_LENGTH(unicode);
if (code_page < 0) {
@@ -8122,14 +7650,11 @@ charmap_decode_string(const char *s,
Py_ssize_t startinpos, endinpos;
PyObject *errorHandler = NULL, *exc = NULL;
Py_ssize_t maplen;
- enum PyUnicode_Kind mapkind;
+ int mapkind;
const void *mapdata;
Py_UCS4 x;
unsigned char ch;
- if (PyUnicode_READY(mapping) == -1)
- return -1;
-
maplen = PyUnicode_GET_LENGTH(mapping);
mapdata = PyUnicode_DATA(mapping);
mapkind = PyUnicode_KIND(mapping);
@@ -8163,7 +7688,7 @@ charmap_decode_string(const char *s,
while (s < e) {
if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
- enum PyUnicode_Kind outkind = writer->kind;
+ int outkind = writer->kind;
const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
if (outkind == PyUnicode_1BYTE_KIND) {
Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
@@ -8282,8 +7807,6 @@ charmap_decode_mapping(const char *s,
goto onError;
}
else if (PyUnicode_Check(item)) {
- if (PyUnicode_READY(item) == -1)
- goto onError;
if (PyUnicode_GET_LENGTH(item) == 1) {
Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
if (value == 0xFFFE)
@@ -8689,7 +8212,7 @@ charmap_encoding_error(
PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Py_ssize_t size, repsize;
Py_ssize_t newpos;
- enum PyUnicode_Kind kind;
+ int kind;
const void *data;
Py_ssize_t index;
/* startpos for collecting unencodable chars */
@@ -8702,8 +8225,6 @@ charmap_encoding_error(
Py_UCS4 ch;
int val;
- if (PyUnicode_READY(unicode) == -1)
- return -1;
size = PyUnicode_GET_LENGTH(unicode);
/* find all unencodable characters */
while (collendpos < size) {
@@ -8799,10 +8320,6 @@ charmap_encoding_error(
break;
}
/* generate replacement */
- if (PyUnicode_READY(repunicode) == -1) {
- Py_DECREF(repunicode);
- return -1;
- }
repsize = PyUnicode_GET_LENGTH(repunicode);
data = PyUnicode_DATA(repunicode);
kind = PyUnicode_KIND(repunicode);
@@ -8843,8 +8360,6 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
const void *data;
int kind;
- if (PyUnicode_READY(unicode) == -1)
- return NULL;
size = PyUnicode_GET_LENGTH(unicode);
data = PyUnicode_DATA(unicode);
kind = PyUnicode_KIND(unicode);
@@ -9123,10 +8638,6 @@ unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
else if (PyUnicode_Check(item)) {
Py_UCS4 replace;
- if (PyUnicode_READY(item) == -1) {
- Py_DECREF(item);
- return -1;
- }
if (PyUnicode_GET_LENGTH(item) != 1)
goto exit;
@@ -9223,8 +8734,6 @@ _PyUnicode_TranslateCharmap(PyObject *input,
return NULL;
}
- if (PyUnicode_READY(input) == -1)
- return NULL;
data = PyUnicode_DATA(input);
kind = PyUnicode_KIND(input);
size = PyUnicode_GET_LENGTH(input);
@@ -9240,8 +8749,6 @@ _PyUnicode_TranslateCharmap(PyObject *input,
ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
- if (PyUnicode_READY(input) == -1)
- return NULL;
if (PyUnicode_IS_ASCII(input)) {
res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
if (res < 0) {
@@ -9337,12 +8844,9 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
PyErr_BadInternalCall();
return NULL;
}
- if (PyUnicode_READY(unicode) == -1)
- return NULL;
if (PyUnicode_IS_ASCII(unicode)) {
/* If the string is already ASCII, just return the same string */
- Py_INCREF(unicode);
- return unicode;
+ return Py_NewRef(unicode);
}
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
@@ -9530,15 +9034,6 @@ _PyUnicode_InsertThousandsGrouping(
assert(0 <= n_digits);
assert(grouping != NULL);
- if (digits != NULL) {
- if (PyUnicode_READY(digits) == -1) {
- return -1;
- }
- }
- if (PyUnicode_READY(thousands_sep) == -1) {
- return -1;
- }
-
Py_ssize_t count = 0;
Py_ssize_t n_zeros;
int loop_broken = 0;
@@ -9624,21 +9119,20 @@ _PyUnicode_InsertThousandsGrouping(
return count;
}
-
-Py_ssize_t
-PyUnicode_Count(PyObject *str,
- PyObject *substr,
- Py_ssize_t start,
- Py_ssize_t end)
+static Py_ssize_t
+unicode_count_impl(PyObject *str,
+ PyObject *substr,
+ Py_ssize_t start,
+ Py_ssize_t end)
{
+ assert(PyUnicode_Check(str));
+ assert(PyUnicode_Check(substr));
+
Py_ssize_t result;
int kind1, kind2;
const void *buf1 = NULL, *buf2 = NULL;
Py_ssize_t len1, len2;
- if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
- return -1;
-
kind1 = PyUnicode_KIND(str);
kind2 = PyUnicode_KIND(substr);
if (kind1 < kind2)
@@ -9658,18 +9152,13 @@ PyUnicode_Count(PyObject *str,
goto onError;
}
+ // We don't reuse `anylib_count` here because of the explicit casts.
switch (kind1) {
case PyUnicode_1BYTE_KIND:
- if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
- result = asciilib_count(
- ((const Py_UCS1*)buf1) + start, end - start,
- buf2, len2, PY_SSIZE_T_MAX
- );
- else
- result = ucs1lib_count(
- ((const Py_UCS1*)buf1) + start, end - start,
- buf2, len2, PY_SSIZE_T_MAX
- );
+ result = ucs1lib_count(
+ ((const Py_UCS1*)buf1) + start, end - start,
+ buf2, len2, PY_SSIZE_T_MAX
+ );
break;
case PyUnicode_2BYTE_KIND:
result = ucs2lib_count(
@@ -9700,6 +9189,18 @@ PyUnicode_Count(PyObject *str,
}
Py_ssize_t
+PyUnicode_Count(PyObject *str,
+ PyObject *substr,
+ Py_ssize_t start,
+ Py_ssize_t end)
+{
+ if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
+ return -1;
+
+ return unicode_count_impl(str, substr, start, end);
+}
+
+Py_ssize_t
PyUnicode_Find(PyObject *str,
PyObject *substr,
Py_ssize_t start,
@@ -9719,8 +9220,6 @@ PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
{
int kind;
Py_ssize_t len, result;
- if (PyUnicode_READY(str) == -1)
- return -2;
len = PyUnicode_GET_LENGTH(str);
ADJUST_INDICES(start, end, len);
if (end - start < 1)
@@ -9749,10 +9248,6 @@ tailmatch(PyObject *self,
Py_ssize_t i;
Py_ssize_t end_sub;
- if (PyUnicode_READY(self) == -1 ||
- PyUnicode_READY(substring) == -1)
- return -1;
-
ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
end -= PyUnicode_GET_LENGTH(substring);
if (end < start)
@@ -10011,8 +9506,6 @@ case_operation(PyObject *self,
void *outdata;
Py_UCS4 maxchar = 0, *tmp, *tmpend;
- assert(PyUnicode_IS_READY(self));
-
kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self);
length = PyUnicode_GET_LENGTH(self);
@@ -10085,7 +9578,7 @@ _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seq
int use_memcpy;
unsigned char *res_data = NULL, *sep_data = NULL;
PyObject *last_obj;
- unsigned int kind = 0;
+ int kind = 0;
/* If empty sequence, return u"". */
if (seqlen == 0) {
@@ -10097,8 +9590,7 @@ _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seq
if (seqlen == 1) {
if (PyUnicode_CheckExact(items[0])) {
res = items[0];
- Py_INCREF(res);
- return res;
+ return Py_NewRef(res);
}
seplen = 0;
maxchar = 0;
@@ -10121,8 +9613,6 @@ _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seq
Py_TYPE(separator)->tp_name);
goto onError;
}
- if (PyUnicode_READY(separator))
- goto onError;
sep = separator;
seplen = PyUnicode_GET_LENGTH(separator);
maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
@@ -10154,8 +9644,6 @@ _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seq
i, Py_TYPE(item)->tp_name);
goto onError;
}
- if (PyUnicode_READY(item) == -1)
- goto onError;
add_sz = PyUnicode_GET_LENGTH(item);
item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
maxchar = Py_MAX(maxchar, item_maxchar);
@@ -10248,9 +9736,8 @@ void
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
Py_UCS4 fill_char)
{
- const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
+ const int kind = PyUnicode_KIND(unicode);
void *data = PyUnicode_DATA(unicode);
- assert(PyUnicode_IS_READY(unicode));
assert(unicode_modifiable(unicode));
assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
assert(start >= 0);
@@ -10268,8 +9755,6 @@ PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
PyErr_BadInternalCall();
return -1;
}
- if (PyUnicode_READY(unicode) == -1)
- return -1;
if (unicode_check_modifiable(unicode))
return -1;
@@ -10378,53 +9863,53 @@ split(PyObject *self,
const void *buf1, *buf2;
Py_ssize_t len1, len2;
PyObject* out;
+ len1 = PyUnicode_GET_LENGTH(self);
+ kind1 = PyUnicode_KIND(self);
- if (maxcount < 0)
- maxcount = PY_SSIZE_T_MAX;
-
- if (PyUnicode_READY(self) == -1)
- return NULL;
-
- if (substring == NULL)
- switch (PyUnicode_KIND(self)) {
+ if (substring == NULL) {
+ if (maxcount < 0) {
+ maxcount = (len1 - 1) / 2 + 1;
+ }
+ switch (kind1) {
case PyUnicode_1BYTE_KIND:
if (PyUnicode_IS_ASCII(self))
return asciilib_split_whitespace(
self, PyUnicode_1BYTE_DATA(self),
- PyUnicode_GET_LENGTH(self), maxcount
+ len1, maxcount
);
else
return ucs1lib_split_whitespace(
self, PyUnicode_1BYTE_DATA(self),
- PyUnicode_GET_LENGTH(self), maxcount
+ len1, maxcount
);
case PyUnicode_2BYTE_KIND:
return ucs2lib_split_whitespace(
self, PyUnicode_2BYTE_DATA(self),
- PyUnicode_GET_LENGTH(self), maxcount
+ len1, maxcount
);
case PyUnicode_4BYTE_KIND:
return ucs4lib_split_whitespace(
self, PyUnicode_4BYTE_DATA(self),
- PyUnicode_GET_LENGTH(self), maxcount
+ len1, maxcount
);
default:
Py_UNREACHABLE();
}
+ }
- if (PyUnicode_READY(substring) == -1)
- return NULL;
-
- kind1 = PyUnicode_KIND(self);
kind2 = PyUnicode_KIND(substring);
- len1 = PyUnicode_GET_LENGTH(self);
len2 = PyUnicode_GET_LENGTH(substring);
+ if (maxcount < 0) {
+ // if len2 == 0, it will raise ValueError.
+ maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
+ // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
+ maxcount = maxcount < 0 ? len1 : maxcount;
+ }
if (kind1 < kind2 || len1 < len2) {
out = PyList_New(1);
if (out == NULL)
return NULL;
- Py_INCREF(self);
- PyList_SET_ITEM(out, 0, self);
+ PyList_SET_ITEM(out, 0, Py_NewRef(self));
return out;
}
buf1 = PyUnicode_DATA(self);
@@ -10471,52 +9956,52 @@ rsplit(PyObject *self,
Py_ssize_t len1, len2;
PyObject* out;
- if (maxcount < 0)
- maxcount = PY_SSIZE_T_MAX;
-
- if (PyUnicode_READY(self) == -1)
- return NULL;
+ len1 = PyUnicode_GET_LENGTH(self);
+ kind1 = PyUnicode_KIND(self);
- if (substring == NULL)
- switch (PyUnicode_KIND(self)) {
+ if (substring == NULL) {
+ if (maxcount < 0) {
+ maxcount = (len1 - 1) / 2 + 1;
+ }
+ switch (kind1) {
case PyUnicode_1BYTE_KIND:
if (PyUnicode_IS_ASCII(self))
return asciilib_rsplit_whitespace(
self, PyUnicode_1BYTE_DATA(self),
- PyUnicode_GET_LENGTH(self), maxcount
+ len1, maxcount
);
else
return ucs1lib_rsplit_whitespace(
self, PyUnicode_1BYTE_DATA(self),
- PyUnicode_GET_LENGTH(self), maxcount
+ len1, maxcount
);
case PyUnicode_2BYTE_KIND:
return ucs2lib_rsplit_whitespace(
self, PyUnicode_2BYTE_DATA(self),
- PyUnicode_GET_LENGTH(self), maxcount
+ len1, maxcount
);
case PyUnicode_4BYTE_KIND:
return ucs4lib_rsplit_whitespace(
self, PyUnicode_4BYTE_DATA(self),
- PyUnicode_GET_LENGTH(self), maxcount
+ len1, maxcount
);
default:
Py_UNREACHABLE();
}
-
- if (PyUnicode_READY(substring) == -1)
- return NULL;
-
- kind1 = PyUnicode_KIND(self);
+ }
kind2 = PyUnicode_KIND(substring);
- len1 = PyUnicode_GET_LENGTH(self);
len2 = PyUnicode_GET_LENGTH(substring);
+ if (maxcount < 0) {
+ // if len2 == 0, it will raise ValueError.
+ maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
+ // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
+ maxcount = maxcount < 0 ? len1 : maxcount;
+ }
if (kind1 < kind2 || len1 < len2) {
out = PyList_New(1);
if (out == NULL)
return NULL;
- Py_INCREF(self);
- PyList_SET_ITEM(out, 0, self);
+ PyList_SET_ITEM(out, 0, Py_NewRef(self));
return out;
}
buf1 = PyUnicode_DATA(self);
@@ -10577,10 +10062,7 @@ anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
{
switch (kind) {
case PyUnicode_1BYTE_KIND:
- if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
- return asciilib_count(sbuf, slen, buf1, len1, maxcount);
- else
- return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
+ return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
case PyUnicode_2BYTE_KIND:
return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
case PyUnicode_4BYTE_KIND:
@@ -10908,8 +10390,6 @@ static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
{
- if (PyUnicode_READY(self) == -1)
- return NULL;
return case_operation(self, do_title);
}
@@ -10926,8 +10406,6 @@ static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
{
- if (PyUnicode_READY(self) == -1)
- return NULL;
if (PyUnicode_GET_LENGTH(self) == 0)
return unicode_result_unchanged(self);
return case_operation(self, do_capitalize);
@@ -10943,8 +10421,6 @@ static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
- if (PyUnicode_READY(self) == -1)
- return NULL;
if (PyUnicode_IS_ASCII(self))
return ascii_upper_or_lower(self, 1);
return case_operation(self, do_casefold);
@@ -10964,8 +10440,6 @@ convert_uc(PyObject *obj, void *addr)
"not %.100s", Py_TYPE(obj)->tp_name);
return 0;
}
- if (PyUnicode_READY(obj) < 0)
- return 0;
if (PyUnicode_GET_LENGTH(obj) != 1) {
PyErr_SetString(PyExc_TypeError,
"The fill character must be exactly one character long");
@@ -10993,9 +10467,6 @@ unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
{
Py_ssize_t marg, left;
- if (PyUnicode_READY(self) == -1)
- return NULL;
-
if (PyUnicode_GET_LENGTH(self) >= width)
return unicode_result_unchanged(self);
@@ -11152,9 +10623,6 @@ _PyUnicode_Equal(PyObject *str1, PyObject *str2)
if (str1 == str2) {
return 1;
}
- if (PyUnicode_READY(str1) || PyUnicode_READY(str2)) {
- return -1;
- }
return unicode_compare_eq(str1, str2);
}
@@ -11163,10 +10631,6 @@ int
PyUnicode_Compare(PyObject *left, PyObject *right)
{
if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
- if (PyUnicode_READY(left) == -1 ||
- PyUnicode_READY(right) == -1)
- return -1;
-
/* a string is equal to itself */
if (left == right)
return 0;
@@ -11186,24 +10650,8 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
Py_ssize_t i;
int kind;
Py_UCS4 chr;
- const unsigned char *ustr = (const unsigned char *)str;
assert(_PyUnicode_CHECK(uni));
- if (!PyUnicode_IS_READY(uni)) {
- const wchar_t *ws = _PyUnicode_WSTR(uni);
- /* Compare Unicode string and source character set string */
- for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
- if (chr != ustr[i])
- return (chr < ustr[i]) ? -1 : 1;
- }
- /* This check keeps Python strings that end in '\0' from comparing equal
- to C strings identical up to that point. */
- if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
- return 1; /* uni is longer */
- if (ustr[i])
- return -1; /* str is longer */
- return 0;
- }
kind = PyUnicode_KIND(uni);
if (kind == PyUnicode_1BYTE_KIND) {
const void *data = PyUnicode_1BYTE_DATA(uni);
@@ -11241,24 +10689,6 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
}
}
-static int
-non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
-{
- size_t i, len;
- const wchar_t *p;
- len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
- if (strlen(str) != len)
- return 0;
- p = _PyUnicode_WSTR(unicode);
- assert(p);
- for (i = 0; i < len; i++) {
- unsigned char c = (unsigned char)str[i];
- if (c >= 128 || p[i] != (wchar_t)c)
- return 0;
- }
- return 1;
-}
-
int
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
{
@@ -11270,11 +10700,6 @@ _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
assert((unsigned char)*p < 128);
}
#endif
- if (PyUnicode_READY(unicode) == -1) {
- /* Memory error or bad data */
- PyErr_Clear();
- return non_ready_unicode_equal_to_ascii_string(unicode, str);
- }
if (!PyUnicode_IS_ASCII(unicode))
return 0;
len = (size_t)PyUnicode_GET_LENGTH(unicode);
@@ -11295,12 +10720,6 @@ _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
}
#endif
- if (PyUnicode_READY(left) == -1) {
- /* memory error or bad data */
- PyErr_Clear();
- return non_ready_unicode_equal_to_ascii_string(left, right->string);
- }
-
if (!PyUnicode_IS_ASCII(left))
return 0;
@@ -11334,10 +10753,6 @@ PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
Py_RETURN_NOTIMPLEMENTED;
- if (PyUnicode_READY(left) == -1 ||
- PyUnicode_READY(right) == -1)
- return NULL;
-
if (left == right) {
switch (op) {
case Py_EQ:
@@ -11385,8 +10800,6 @@ PyUnicode_Contains(PyObject *str, PyObject *substr)
Py_TYPE(substr)->tp_name);
return -1;
}
- if (PyUnicode_READY(substr) == -1)
- return -1;
if (ensure_unicode(str) < 0)
return -1;
@@ -11450,8 +10863,6 @@ PyUnicode_Concat(PyObject *left, PyObject *right)
Py_TYPE(right)->tp_name);
return NULL;
}
- if (PyUnicode_READY(right) < 0)
- return NULL;
/* Shortcuts */
PyObject *empty = unicode_get_empty(); // Borrowed reference
@@ -11505,17 +10916,11 @@ PyUnicode_Append(PyObject **p_left, PyObject *right)
goto error;
}
- if (PyUnicode_READY(left) == -1)
- goto error;
- if (PyUnicode_READY(right) == -1)
- goto error;
-
/* Shortcuts */
PyObject *empty = unicode_get_empty(); // Borrowed reference
if (left == empty) {
Py_DECREF(left);
- Py_INCREF(right);
- *p_left = right;
+ *p_left = Py_NewRef(right);
return;
}
if (right == empty) {
@@ -11576,7 +10981,7 @@ PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
}
/*
-Wraps stringlib_parse_args_finds() and additionally ensures that the
+Wraps asciilib_parse_args_finds() and additionally ensures that the
first argument is a unicode object.
*/
@@ -11585,8 +10990,7 @@ parse_args_finds_unicode(const char * function_name, PyObject *args,
PyObject **substring,
Py_ssize_t *start, Py_ssize_t *end)
{
- if(stringlib_parse_args_finds(function_name, args, substring,
- start, end)) {
+ if (asciilib_parse_args_finds(function_name, args, substring, start, end)) {
if (ensure_unicode(*substring) < 0)
return 0;
return 1;
@@ -11607,62 +11011,16 @@ unicode_count(PyObject *self, PyObject *args)
PyObject *substring = NULL; /* initialize to fix a compiler warning */
Py_ssize_t start = 0;
Py_ssize_t end = PY_SSIZE_T_MAX;
- PyObject *result;
- int kind1, kind2;
- const void *buf1, *buf2;
- Py_ssize_t len1, len2, iresult;
+ Py_ssize_t result;
if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
return NULL;
- kind1 = PyUnicode_KIND(self);
- kind2 = PyUnicode_KIND(substring);
- if (kind1 < kind2)
- return PyLong_FromLong(0);
-
- len1 = PyUnicode_GET_LENGTH(self);
- len2 = PyUnicode_GET_LENGTH(substring);
- ADJUST_INDICES(start, end, len1);
- if (end - start < len2)
- return PyLong_FromLong(0);
-
- buf1 = PyUnicode_DATA(self);
- buf2 = PyUnicode_DATA(substring);
- if (kind2 != kind1) {
- buf2 = unicode_askind(kind2, buf2, len2, kind1);
- if (!buf2)
- return NULL;
- }
- switch (kind1) {
- case PyUnicode_1BYTE_KIND:
- iresult = ucs1lib_count(
- ((const Py_UCS1*)buf1) + start, end - start,
- buf2, len2, PY_SSIZE_T_MAX
- );
- break;
- case PyUnicode_2BYTE_KIND:
- iresult = ucs2lib_count(
- ((const Py_UCS2*)buf1) + start, end - start,
- buf2, len2, PY_SSIZE_T_MAX
- );
- break;
- case PyUnicode_4BYTE_KIND:
- iresult = ucs4lib_count(
- ((const Py_UCS4*)buf1) + start, end - start,
- buf2, len2, PY_SSIZE_T_MAX
- );
- break;
- default:
- Py_UNREACHABLE();
- }
-
- result = PyLong_FromSsize_t(iresult);
-
- assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
- if (kind2 != kind1)
- PyMem_Free((void *)buf2);
+ result = unicode_count_impl(self, substring, start, end);
+ if (result == -1)
+ return NULL;
- return result;
+ return PyLong_FromSsize_t(result);
}
/*[clinic input]
@@ -11709,9 +11067,6 @@ unicode_expandtabs_impl(PyObject *self, int tabsize)
int kind;
int found;
- if (PyUnicode_READY(self) == -1)
- return NULL;
-
/* First pass: determine size of output string */
src_len = PyUnicode_GET_LENGTH(self);
i = j = line_pos = 0;
@@ -11797,9 +11152,6 @@ unicode_find(PyObject *self, PyObject *args)
if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
return NULL;
- if (PyUnicode_READY(self) == -1)
- return NULL;
-
result = any_find_slice(self, substring, start, end, 1);
if (result == -2)
@@ -11812,16 +11164,13 @@ static PyObject *
unicode_getitem(PyObject *self, Py_ssize_t index)
{
const void *data;
- enum PyUnicode_Kind kind;
+ int kind;
Py_UCS4 ch;
if (!PyUnicode_Check(self)) {
PyErr_BadArgument();
return NULL;
}
- if (PyUnicode_READY(self) == -1) {
- return NULL;
- }
if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
PyErr_SetString(PyExc_IndexError, "string index out of range");
return NULL;
@@ -11844,8 +11193,6 @@ unicode_hash(PyObject *self)
#endif
if (_PyUnicode_HASH(self) != -1)
return _PyUnicode_HASH(self);
- if (PyUnicode_READY(self) == -1)
- return -1;
x = _Py_HashBytes(PyUnicode_DATA(self),
PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
@@ -11874,9 +11221,6 @@ unicode_index(PyObject *self, PyObject *args)
if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
return NULL;
- if (PyUnicode_READY(self) == -1)
- return NULL;
-
result = any_find_slice(self, substring, start, end, 1);
if (result == -2)
@@ -11903,9 +11247,6 @@ static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
- if (PyUnicode_READY(self) == -1) {
- return NULL;
- }
return PyBool_FromLong(PyUnicode_IS_ASCII(self));
}
@@ -11927,8 +11268,6 @@ unicode_islower_impl(PyObject *self)
const void *data;
int cased;
- if (PyUnicode_READY(self) == -1)
- return NULL;
length = PyUnicode_GET_LENGTH(self);
kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self);
@@ -11972,8 +11311,6 @@ unicode_isupper_impl(PyObject *self)
const void *data;
int cased;
- if (PyUnicode_READY(self) == -1)
- return NULL;
length = PyUnicode_GET_LENGTH(self);
kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self);
@@ -12017,8 +11354,6 @@ unicode_istitle_impl(PyObject *self)
const void *data;
int cased, previous_is_cased;
- if (PyUnicode_READY(self) == -1)
- return NULL;
length = PyUnicode_GET_LENGTH(self);
kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self);
@@ -12074,8 +11409,6 @@ unicode_isspace_impl(PyObject *self)
int kind;
const void *data;
- if (PyUnicode_READY(self) == -1)
- return NULL;
length = PyUnicode_GET_LENGTH(self);
kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self);
@@ -12114,8 +11447,6 @@ unicode_isalpha_impl(PyObject *self)
int kind;
const void *data;
- if (PyUnicode_READY(self) == -1)
- return NULL;
length = PyUnicode_GET_LENGTH(self);
kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self);
@@ -12153,9 +11484,6 @@ unicode_isalnum_impl(PyObject *self)
const void *data;
Py_ssize_t len, i;
- if (PyUnicode_READY(self) == -1)
- return NULL;
-
kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self);
len = PyUnicode_GET_LENGTH(self);
@@ -12195,8 +11523,6 @@ unicode_isdecimal_impl(PyObject *self)
int kind;
const void *data;
- if (PyUnicode_READY(self) == -1)
- return NULL;
length = PyUnicode_GET_LENGTH(self);
kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self);
@@ -12234,8 +11560,6 @@ unicode_isdigit_impl(PyObject *self)
int kind;
const void *data;
- if (PyUnicode_READY(self) == -1)
- return NULL;
length = PyUnicode_GET_LENGTH(self);
kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self);
@@ -12274,8 +11598,6 @@ unicode_isnumeric_impl(PyObject *self)
int kind;
const void *data;
- if (PyUnicode_READY(self) == -1)
- return NULL;
length = PyUnicode_GET_LENGTH(self);
kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self);
@@ -12300,9 +11622,6 @@ Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject *self)
{
Py_ssize_t i;
- if (PyUnicode_READY(self) == -1)
- return -1;
-
Py_ssize_t len = PyUnicode_GET_LENGTH(self);
if (len == 0) {
/* an empty string is not a valid identifier */
@@ -12336,54 +11655,10 @@ _PyUnicode_ScanIdentifier(PyObject *self)
int
PyUnicode_IsIdentifier(PyObject *self)
{
- if (PyUnicode_IS_READY(self)) {
- Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
- Py_ssize_t len = PyUnicode_GET_LENGTH(self);
- /* an empty string is not a valid identifier */
- return len && i == len;
- }
- else {
-_Py_COMP_DIAG_PUSH
-_Py_COMP_DIAG_IGNORE_DEPR_DECLS
- Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
- if (len == 0) {
- /* an empty string is not a valid identifier */
- return 0;
- }
-
- const wchar_t *wstr = _PyUnicode_WSTR(self);
- Py_UCS4 ch = wstr[i++];
-#if SIZEOF_WCHAR_T == 2
- if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
- && i < len
- && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
- {
- ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
- i++;
- }
-#endif
- if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
- return 0;
- }
-
- while (i < len) {
- ch = wstr[i++];
-#if SIZEOF_WCHAR_T == 2
- if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
- && i < len
- && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
- {
- ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
- i++;
- }
-#endif
- if (!_PyUnicode_IsXidContinue(ch)) {
- return 0;
- }
- }
- return 1;
-_Py_COMP_DIAG_POP
- }
+ Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
+ Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+ /* an empty string is not a valid identifier */
+ return len && i == len;
}
/*[clinic input]
@@ -12419,8 +11694,6 @@ unicode_isprintable_impl(PyObject *self)
int kind;
const void *data;
- if (PyUnicode_READY(self) == -1)
- return NULL;
length = PyUnicode_GET_LENGTH(self);
kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self);
@@ -12462,8 +11735,6 @@ unicode_join(PyObject *self, PyObject *iterable)
static Py_ssize_t
unicode_length(PyObject *self)
{
- if (PyUnicode_READY(self) == -1)
- return -1;
return PyUnicode_GET_LENGTH(self);
}
@@ -12483,9 +11754,6 @@ static PyObject *
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
{
- if (PyUnicode_READY(self) == -1)
- return NULL;
-
if (PyUnicode_GET_LENGTH(self) >= width)
return unicode_result_unchanged(self);
@@ -12502,8 +11770,6 @@ static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
- if (PyUnicode_READY(self) == -1)
- return NULL;
if (PyUnicode_IS_ASCII(self))
return ascii_upper_or_lower(self, 1);
return case_operation(self, do_lower);
@@ -12528,9 +11794,6 @@ _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
BLOOM_MASK sepmask;
Py_ssize_t seplen;
- if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
- return NULL;
-
kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self);
len = PyUnicode_GET_LENGTH(self);
@@ -12576,9 +11839,6 @@ PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
int kind;
Py_ssize_t length;
- if (PyUnicode_READY(self) == -1)
- return NULL;
-
length = PyUnicode_GET_LENGTH(self);
end = Py_MIN(end, length);
@@ -12611,9 +11871,6 @@ do_strip(PyObject *self, int striptype)
{
Py_ssize_t len, i, j;
- if (PyUnicode_READY(self) == -1)
- return NULL;
-
len = PyUnicode_GET_LENGTH(self);
if (PyUnicode_IS_ASCII(self)) {
@@ -12760,9 +12017,6 @@ unicode_repeat(PyObject *str, Py_ssize_t len)
if (len == 1)
return unicode_result_unchanged(str);
- if (PyUnicode_READY(str) == -1)
- return NULL;
-
if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
PyErr_SetString(PyExc_OverflowError,
"repeated string is too long");
@@ -12837,8 +12091,6 @@ unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
Py_ssize_t count)
/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
{
- if (PyUnicode_READY(self) == -1)
- return NULL;
return replace(self, old, new, count);
}
@@ -12908,9 +12160,6 @@ unicode_repr(PyObject *unicode)
const void *idata;
void *odata;
- if (PyUnicode_READY(unicode) == -1)
- return NULL;
-
isize = PyUnicode_GET_LENGTH(unicode);
idata = PyUnicode_DATA(unicode);
@@ -13083,9 +12332,6 @@ unicode_rfind(PyObject *self, PyObject *args)
if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
return NULL;
- if (PyUnicode_READY(self) == -1)
- return NULL;
-
result = any_find_slice(self, substring, start, end, -1);
if (result == -2)
@@ -13115,9 +12361,6 @@ unicode_rindex(PyObject *self, PyObject *args)
if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
return NULL;
- if (PyUnicode_READY(self) == -1)
- return NULL;
-
result = any_find_slice(self, substring, start, end, -1);
if (result == -2)
@@ -13147,9 +12390,6 @@ static PyObject *
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
{
- if (PyUnicode_READY(self) == -1)
- return NULL;
-
if (PyUnicode_GET_LENGTH(self) >= width)
return unicode_result_unchanged(self);
@@ -13384,7 +12624,7 @@ unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
/*[clinic input]
str.splitlines as unicode_splitlines
- keepends: bool(accept={int}) = False
+ keepends: bool = False
Return a list of the lines in the string, breaking at line boundaries.
@@ -13394,7 +12634,7 @@ true.
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
-/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
+/*[clinic end generated code: output=f664dcdad153ec40 input=ba6ad05ee85d2b55]*/
{
return PyUnicode_Splitlines(self, keepends);
}
@@ -13415,8 +12655,6 @@ static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
{
- if (PyUnicode_READY(self) == -1)
- return NULL;
return case_operation(self, do_swapcase);
}
@@ -13582,8 +12820,6 @@ static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
- if (PyUnicode_READY(self) == -1)
- return NULL;
if (PyUnicode_IS_ASCII(self))
return ascii_upper_or_lower(self, 0);
return case_operation(self, do_upper);
@@ -13610,9 +12846,6 @@ unicode_zfill_impl(PyObject *self, Py_ssize_t width)
const void *data;
Py_UCS4 chr;
- if (PyUnicode_READY(self) == -1)
- return NULL;
-
if (PyUnicode_GET_LENGTH(self) >= width)
return unicode_result_unchanged(self);
@@ -13655,7 +12888,7 @@ unicode_startswith(PyObject *self,
Py_ssize_t end = PY_SSIZE_T_MAX;
int result;
- if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
+ if (!asciilib_parse_args_finds("startswith", args, &subobj, &start, &end))
return NULL;
if (PyTuple_Check(subobj)) {
Py_ssize_t i;
@@ -13709,7 +12942,7 @@ unicode_endswith(PyObject *self,
Py_ssize_t end = PY_SSIZE_T_MAX;
int result;
- if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
+ if (!asciilib_parse_args_finds("endswith", args, &subobj, &start, &end))
return NULL;
if (PyTuple_Check(subobj)) {
Py_ssize_t i;
@@ -13756,7 +12989,7 @@ _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
else {
/* use a value smaller than PyUnicode_1BYTE_KIND() so
_PyUnicodeWriter_PrepareKind() will copy the buffer. */
- writer->kind = PyUnicode_WCHAR_KIND;
+ writer->kind = 0;
assert(writer->kind <= PyUnicode_1BYTE_KIND);
/* Copy-on-write mode: set buffer size to 0 so
@@ -13776,7 +13009,7 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
/* use a value smaller than PyUnicode_1BYTE_KIND() so
_PyUnicodeWriter_PrepareKind() will copy the buffer. */
- writer->kind = PyUnicode_WCHAR_KIND;
+ writer->kind = 0;
assert(writer->kind <= PyUnicode_1BYTE_KIND);
}
@@ -13869,7 +13102,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
- enum PyUnicode_Kind kind)
+ int kind)
{
Py_UCS4 maxchar;
@@ -13911,8 +13144,6 @@ _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
Py_UCS4 maxchar;
Py_ssize_t len;
- if (PyUnicode_READY(str) == -1)
- return -1;
len = PyUnicode_GET_LENGTH(str);
if (len == 0)
return 0;
@@ -13921,8 +13152,7 @@ _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
if (writer->buffer == NULL && !writer->overallocate) {
assert(_PyUnicode_CheckConsistency(str, 1));
writer->readonly = 1;
- Py_INCREF(str);
- writer->buffer = str;
+ writer->buffer = Py_NewRef(str);
_PyUnicodeWriter_Update(writer);
writer->pos += len;
return 0;
@@ -13943,9 +13173,6 @@ _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
Py_UCS4 maxchar;
Py_ssize_t len;
- if (PyUnicode_READY(str) == -1)
- return -1;
-
assert(0 <= start);
assert(end <= PyUnicode_GET_LENGTH(str));
assert(start <= end);
@@ -14074,7 +13301,7 @@ _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
}
assert(_PyUnicode_CheckConsistency(str, 1));
- return unicode_result_ready(str);
+ return unicode_result(str);
}
void
@@ -14113,8 +13340,6 @@ unicode___format___impl(PyObject *self, PyObject *format_spec)
_PyUnicodeWriter writer;
int ret;
- if (PyUnicode_READY(self) == -1)
- return NULL;
_PyUnicodeWriter_Init(&writer);
ret = _PyUnicode_FormatAdvancedWriter(&writer,
self, format_spec, 0,
@@ -14140,11 +13365,13 @@ unicode_sizeof_impl(PyObject *self)
/* If it's a compact object, account for base structure +
character data. */
- if (PyUnicode_IS_COMPACT_ASCII(self))
+ if (PyUnicode_IS_COMPACT_ASCII(self)) {
size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
- else if (PyUnicode_IS_COMPACT(self))
+ }
+ else if (PyUnicode_IS_COMPACT(self)) {
size = sizeof(PyCompactUnicodeObject) +
(PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
+ }
else {
/* If it is a two-block object, account for base object, and
for character block if present. */
@@ -14153,10 +13380,6 @@ unicode_sizeof_impl(PyObject *self)
size += (PyUnicode_GET_LENGTH(self) + 1) *
PyUnicode_KIND(self);
}
- /* If the wstr pointer is present, account for it unless it is shared
- with the data pointer. Check if the data is not shared. */
- if (_PyUnicode_HAS_WSTR_MEMORY(self))
- size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
if (_PyUnicode_HAS_UTF8_MEMORY(self))
size += PyUnicode_UTF8_LENGTH(self) + 1;
@@ -14255,9 +13478,6 @@ static PySequenceMethods unicode_as_sequence = {
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
- if (PyUnicode_READY(self) == -1)
- return NULL;
-
if (_PyIndex_Check(item)) {
Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
if (i == -1 && PyErr_Occurred())
@@ -14340,7 +13560,7 @@ struct unicode_formatter_t {
Py_ssize_t arglen, argidx;
PyObject *dict;
- enum PyUnicode_Kind fmtkind;
+ int fmtkind;
Py_ssize_t fmtcnt, fmtpos;
const void *fmtdata;
PyObject *fmtstr;
@@ -14479,7 +13699,6 @@ _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
return NULL;
assert(unicode_modifiable(result));
- assert(PyUnicode_IS_READY(result));
assert(PyUnicode_IS_ASCII(result));
/* To modify the string in-place, there can only be one reference. */
@@ -14534,8 +13753,7 @@ _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
for (i = 0; i < numdigits; i++)
*b1++ = *buf++;
*b1 = '\0';
- Py_DECREF(result);
- result = r1;
+ Py_SETREF(result, r1);
buf = PyBytes_AS_STRING(result);
len = numnondigits + prec;
}
@@ -14552,8 +13770,7 @@ _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
|| buf != PyUnicode_DATA(result)) {
PyObject *unicode;
unicode = _PyUnicode_FromASCII(buf, len);
- Py_DECREF(result);
- result = unicode;
+ Py_SETREF(result, unicode);
}
else if (len != PyUnicode_GET_LENGTH(result)) {
if (PyUnicode_Resize(&result, len) < 0)
@@ -14594,8 +13811,7 @@ mainformatlong(PyObject *v,
assert(PyLong_Check(iobj));
}
else {
- iobj = v;
- Py_INCREF(iobj);
+ iobj = Py_NewRef(v);
}
if (PyLong_CheckExact(v)
@@ -14918,8 +14134,7 @@ unicode_format_arg_format(struct unicode_formatter_t *ctx,
}
if (PyUnicode_CheckExact(v) && arg->ch == 's') {
- *p_str = v;
- Py_INCREF(*p_str);
+ *p_str = Py_NewRef(v);
}
else {
if (arg->ch == 's')
@@ -15001,7 +14216,7 @@ unicode_format_arg_output(struct unicode_formatter_t *ctx,
PyObject *str)
{
Py_ssize_t len;
- enum PyUnicode_Kind kind;
+ int kind;
const void *pbuf;
Py_ssize_t pindex;
Py_UCS4 signchar;
@@ -15015,9 +14230,6 @@ unicode_format_arg_output(struct unicode_formatter_t *ctx,
if (arg->sign && arg->flags & F_ZERO)
fill = '0';
- if (PyUnicode_READY(str) == -1)
- return -1;
-
len = PyUnicode_GET_LENGTH(str);
if ((arg->width == -1 || arg->width <= len)
&& (arg->prec == -1 || arg->prec >= len)
@@ -15319,15 +14531,12 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
{
PyObject *self;
Py_ssize_t length, char_size;
- int share_wstr, share_utf8;
- unsigned int kind;
+ int share_utf8;
+ int kind;
void *data;
assert(PyType_IsSubtype(type, &PyUnicode_Type));
assert(_PyUnicode_CHECK(unicode));
- if (PyUnicode_READY(unicode) == -1) {
- return NULL;
- }
self = type->tp_alloc(type, 0);
if (self == NULL) {
@@ -15346,15 +14555,12 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
_PyUnicode_STATE(self).kind = kind;
_PyUnicode_STATE(self).compact = 0;
_PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
- _PyUnicode_STATE(self).ready = 1;
- _PyUnicode_WSTR(self) = NULL;
+ _PyUnicode_STATE(self).statically_allocated = 0;
_PyUnicode_UTF8_LENGTH(self) = 0;
_PyUnicode_UTF8(self) = NULL;
- _PyUnicode_WSTR_LENGTH(self) = 0;
_PyUnicode_DATA_ANY(self) = NULL;
share_utf8 = 0;
- share_wstr = 0;
if (kind == PyUnicode_1BYTE_KIND) {
char_size = 1;
if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
@@ -15362,14 +14568,10 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
}
else if (kind == PyUnicode_2BYTE_KIND) {
char_size = 2;
- if (sizeof(wchar_t) == 2)
- share_wstr = 1;
}
else {
assert(kind == PyUnicode_4BYTE_KIND);
char_size = 4;
- if (sizeof(wchar_t) == 4)
- share_wstr = 1;
}
/* Ensure we won't overflow the length. */
@@ -15388,13 +14590,8 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
_PyUnicode_UTF8_LENGTH(self) = length;
_PyUnicode_UTF8(self) = data;
}
- if (share_wstr) {
- _PyUnicode_WSTR_LENGTH(self) = length;
- _PyUnicode_WSTR(self) = (wchar_t *)data;
- }
- memcpy(data, PyUnicode_DATA(unicode),
- kind * (length + 1));
+ memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
assert(_PyUnicode_CheckConsistency(self, 1));
#ifdef Py_DEBUG
_PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
@@ -15474,12 +14671,14 @@ PyTypeObject PyUnicode_Type = {
/* Initialize the Unicode implementation */
-void
-_PyUnicode_InitState(PyInterpreterState *interp)
+static void
+_init_global_state(void)
{
- if (!_Py_IsMainInterpreter(interp)) {
+ static int initialized = 0;
+ if (initialized) {
return;
}
+ initialized = 1;
/* initialize the linebreak bloom filter */
const Py_UCS2 linebreak[] = {
@@ -15497,21 +14696,42 @@ _PyUnicode_InitState(PyInterpreterState *interp)
Py_ARRAY_LENGTH(linebreak));
}
+void
+_PyUnicode_InitState(PyInterpreterState *interp)
+{
+ if (!_Py_IsMainInterpreter(interp)) {
+ return;
+ }
+ _init_global_state();
+}
+
PyStatus
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
{
- if (!_Py_IsMainInterpreter(interp)) {
- return _PyStatus_OK();
+ // Initialize the global interned dict
+ if (init_interned_dict(interp)) {
+ PyErr_Clear();
+ return _PyStatus_ERR("failed to create interned dict");
}
+ if (_Py_IsMainInterpreter(interp)) {
+ /* Intern statically allocated string identifiers and deepfreeze strings.
+ * This must be done before any module initialization so that statically
+ * allocated string identifiers are used instead of heap allocated strings.
+ * Deepfreeze uses the interned identifiers if present to save space
+ * else generates them and they are interned to speed up dict lookups.
+ */
+ _PyUnicode_InitStaticStrings(interp);
+
#ifdef Py_DEBUG
- assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
+ assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
- for (int i = 0; i < 256; i++) {
- assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
- }
+ for (int i = 0; i < 256; i++) {
+ assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
+ }
#endif
+ }
return _PyStatus_OK();
}
@@ -15520,17 +14740,13 @@ _PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
PyStatus
_PyUnicode_InitTypes(PyInterpreterState *interp)
{
- if (!_Py_IsMainInterpreter(interp)) {
- return _PyStatus_OK();
- }
-
- if (PyType_Ready(&EncodingMapType) < 0) {
+ if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
goto error;
}
- if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
+ if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
goto error;
}
- if (PyType_Ready(&PyFormatterIter_Type) < 0) {
+ if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
goto error;
}
return _PyStatus_OK();
@@ -15541,7 +14757,7 @@ error:
void
-PyUnicode_InternInPlace(PyObject **p)
+_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
{
PyObject *s = *p;
#ifdef Py_DEBUG
@@ -15563,19 +14779,26 @@ PyUnicode_InternInPlace(PyObject **p)
return;
}
- if (PyUnicode_READY(s) == -1) {
- PyErr_Clear();
+ /* Look in the global cache first. */
+ PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
+ if (r != NULL && r != s) {
+ Py_SETREF(*p, Py_NewRef(r));
return;
}
- if (interned == NULL) {
- interned = PyDict_New();
- if (interned == NULL) {
- PyErr_Clear(); /* Don't leave an exception */
- return;
+ /* Handle statically allocated strings. */
+ if (_PyUnicode_STATE(s).statically_allocated) {
+ assert(_Py_IsImmortal(s));
+ if (_Py_hashtable_set(INTERNED_STRINGS, s, s) == 0) {
+ _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
}
+ return;
}
+ /* Look in the per-interpreter cache. */
+ PyObject *interned = get_interned_dict(interp);
+ assert(interned != NULL);
+
PyObject *t = PyDict_SetDefault(interned, s, s);
if (t == NULL) {
PyErr_Clear();
@@ -15583,35 +14806,44 @@ PyUnicode_InternInPlace(PyObject **p)
}
if (t != s) {
- Py_INCREF(t);
- Py_SETREF(*p, t);
+ Py_SETREF(*p, Py_NewRef(t));
+ return;
+ }
+
+ if (_Py_IsImmortal(s)) {
+ // XXX Restrict this to the main interpreter?
+ _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
return;
}
- /* The two references in interned dict (key and value) are not counted by
- refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
- this. */
- Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
- _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
+#ifdef Py_REF_DEBUG
+ /* The reference count value excluding the 2 references from the
+ interned dictionary should be excluded from the RefTotal. The
+ decrements to these objects will not be registered so they
+ need to be accounted for in here. */
+ for (Py_ssize_t i = 0; i < Py_REFCNT(s) - 2; i++) {
+ _Py_DecRefTotal(_PyInterpreterState_GET());
+ }
+#endif
+ _Py_SetImmortal(s);
+ _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
+}
+
+void
+PyUnicode_InternInPlace(PyObject **p)
+{
+ PyInterpreterState *interp = _PyInterpreterState_GET();
+ _PyUnicode_InternInPlace(interp, p);
}
+// Function kept for the stable ABI.
+PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
void
PyUnicode_InternImmortal(PyObject **p)
{
- if (PyErr_WarnEx(PyExc_DeprecationWarning,
- "PyUnicode_InternImmortal() is deprecated; "
- "use PyUnicode_InternInPlace() instead", 1) < 0)
- {
- // The function has no return value, the exception cannot
- // be reported to the caller, so just log it.
- PyErr_WriteUnraisable(NULL);
- }
-
PyUnicode_InternInPlace(p);
- if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
- _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
- Py_INCREF(*p);
- }
+ // Leak a reference on purpose
+ Py_INCREF(*p);
}
PyObject *
@@ -15628,61 +14860,81 @@ PyUnicode_InternFromString(const char *cp)
void
_PyUnicode_ClearInterned(PyInterpreterState *interp)
{
- if (!_Py_IsMainInterpreter(interp)) {
- // interned dict is shared by all interpreters
- return;
- }
-
+ PyObject *interned = get_interned_dict(interp);
if (interned == NULL) {
return;
}
assert(PyDict_CheckExact(interned));
- /* Interned unicode strings are not forcibly deallocated; rather, we give
- them their stolen references back, and then clear and DECREF the
- interned dict. */
-
+ /* TODO:
+ * Currently, the runtime is not able to guarantee that it can exit without
+ * allocations that carry over to a future initialization of Python within
+ * the same process. i.e:
+ * ./python -X showrefcount -c 'import itertools'
+ * [237 refs, 237 blocks]
+ *
+ * Therefore, this should remain disabled for until there is a strict guarantee
+ * that no memory will be left after `Py_Finalize`.
+ */
+#ifdef Py_DEBUG
+ /* For all non-singleton interned strings, restore the two valid references
+ to that instance from within the intern string dictionary and let the
+ normal reference counting process clean up these instances. */
#ifdef INTERNED_STATS
fprintf(stderr, "releasing %zd interned strings\n",
PyDict_GET_SIZE(interned));
- Py_ssize_t immortal_size = 0, mortal_size = 0;
+ Py_ssize_t total_length = 0;
#endif
Py_ssize_t pos = 0;
PyObject *s, *ignored_value;
while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
assert(PyUnicode_IS_READY(s));
-
+ int shared = 0;
switch (PyUnicode_CHECK_INTERNED(s)) {
case SSTATE_INTERNED_IMMORTAL:
- Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
-#ifdef INTERNED_STATS
- immortal_size += PyUnicode_GET_LENGTH(s);
-#endif
- break;
- case SSTATE_INTERNED_MORTAL:
- // Restore the two references (key and value) ignored
+ // Skip the Immortal Instance check and restore
+ // the two references (key and value) ignored
// by PyUnicode_InternInPlace().
- Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
+ s->ob_refcnt = 2;
#ifdef INTERNED_STATS
- mortal_size += PyUnicode_GET_LENGTH(s);
+ total_length += PyUnicode_GET_LENGTH(s);
#endif
break;
+ case SSTATE_INTERNED_IMMORTAL_STATIC:
+ /* It is shared between interpreters, so we should unmark it
+ only when this is the last interpreter in which it's
+ interned. We immortalize all the statically initialized
+ strings during startup, so we can rely on the
+ main interpreter to be the last one. */
+ if (!_Py_IsMainInterpreter(interp)) {
+ shared = 1;
+ }
+ break;
+ case SSTATE_INTERNED_MORTAL:
+ /* fall through */
case SSTATE_NOT_INTERNED:
/* fall through */
default:
Py_UNREACHABLE();
}
- _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
+ if (!shared) {
+ _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
+ }
}
#ifdef INTERNED_STATS
fprintf(stderr,
- "total size of all interned strings: %zd/%zd mortal/immortal\n",
- mortal_size, immortal_size);
+ "total length of all interned strings: %zd characters\n",
+ total_length);
#endif
- PyDict_Clear(interned);
- Py_CLEAR(interned);
+ struct _Py_unicode_state *state = &interp->unicode;
+ struct _Py_unicode_ids *ids = &state->ids;
+ for (Py_ssize_t i=0; i < ids->size; i++) {
+ Py_XINCREF(ids->array[i]);
+ }
+#endif /* Py_DEBUG */
+ clear_interned_dict(interp);
}
@@ -15779,7 +15031,7 @@ unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
if (it->it_seq != NULL) {
return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
} else {
- PyObject *u = (PyObject *)_PyUnicode_New(0);
+ PyObject *u = unicode_new_empty();
if (u == NULL) {
Py_XDECREF(iter);
return NULL;
@@ -15873,8 +15125,6 @@ unicode_iter(PyObject *seq)
PyErr_BadInternalCall();
return NULL;
}
- if (PyUnicode_READY(seq) == -1)
- return NULL;
if (PyUnicode_IS_COMPACT_ASCII(seq)) {
it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
}
@@ -15884,8 +15134,7 @@ unicode_iter(PyObject *seq)
if (it == NULL)
return NULL;
it->it_index = 0;
- Py_INCREF(seq);
- it->it_seq = seq;
+ it->it_seq = Py_NewRef(seq);
_PyObject_GC_TRACK(it);
return (PyObject *)it;
}
@@ -16012,10 +15261,13 @@ init_fs_codec(PyInterpreterState *interp)
/* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
global configuration variables. */
- if (_Py_SetFileSystemEncoding(fs_codec->encoding,
- fs_codec->errors) < 0) {
- PyErr_NoMemory();
- return -1;
+ if (_Py_IsMainInterpreter(interp)) {
+
+ if (_Py_SetFileSystemEncoding(fs_codec->encoding,
+ fs_codec->errors) < 0) {
+ PyErr_NoMemory();
+ return -1;
+ }
}
return 0;
}
@@ -16098,7 +15350,7 @@ _PyUnicode_EnableLegacyWindowsFSEncoding(void)
static inline int
unicode_is_finalizing(void)
{
- return (interned == NULL);
+ return (get_interned_dict(_PyInterpreterState_Main()) == NULL);
}
#endif
@@ -16106,42 +15358,9 @@ unicode_is_finalizing(void)
void
_PyUnicode_FiniTypes(PyInterpreterState *interp)
{
- if (!_Py_IsMainInterpreter(interp)) {
- return;
- }
-
- _PyStaticType_Dealloc(&EncodingMapType);
- _PyStaticType_Dealloc(&PyFieldNameIter_Type);
- _PyStaticType_Dealloc(&PyFormatterIter_Type);
-}
-
-
-static void unicode_static_dealloc(PyObject *op)
-{
- PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
-
- assert(ascii->state.compact);
-
- if (ascii->state.ascii) {
- if (ascii->wstr) {
- PyObject_Free(ascii->wstr);
- ascii->wstr = NULL;
- }
- }
- else {
- PyCompactUnicodeObject* compact = (PyCompactUnicodeObject*)op;
- void* data = (void*)(compact + 1);
- if (ascii->wstr && ascii->wstr != data) {
- PyObject_Free(ascii->wstr);
- ascii->wstr = NULL;
- compact->wstr_length = 0;
- }
- if (compact->utf8) {
- PyObject_Free(compact->utf8);
- compact->utf8 = NULL;
- compact->utf8_length = 0;
- }
- }
+ _PyStaticType_Dealloc(interp, &EncodingMapType);
+ _PyStaticType_Dealloc(interp, &PyFieldNameIter_Type);
+ _PyStaticType_Dealloc(interp, &PyFormatterIter_Type);
}
@@ -16150,35 +15369,18 @@ _PyUnicode_Fini(PyInterpreterState *interp)
{
struct _Py_unicode_state *state = &interp->unicode;
- if (_Py_IsMainInterpreter(interp)) {
- // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
- assert(interned == NULL);
- // bpo-47182: force a unicodedata CAPI capsule re-import on
- // subsequent initialization of main interpreter.
- ucnhash_capi = NULL;
- }
+ // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
+ assert(get_interned_dict(interp) == NULL);
_PyUnicode_FiniEncodings(&state->fs_codec);
- unicode_clear_identifiers(state);
-
- // Clear the single character singletons
- for (int i = 0; i < 128; i++) {
- unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).ascii[i]);
- }
- for (int i = 0; i < 128; i++) {
- unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).latin1[i]);
- }
-}
-
+ // bpo-47182: force a unicodedata CAPI capsule re-import on
+ // subsequent initialization of interpreter.
+ interp->unicode.ucnhash_capi = NULL;
-void
-_PyStaticUnicode_Dealloc(PyObject *op)
-{
- unicode_static_dealloc(op);
+ unicode_clear_identifiers(state);
}
-
/* A _string module, to export formatter_parser and formatter_field_name_split
to the string.Formatter class implemented in Python. */
@@ -16190,12 +15392,18 @@ static PyMethodDef _string_methods[] = {
{NULL, NULL}
};
+static PyModuleDef_Slot module_slots[] = {
+ {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
+ {0, NULL}
+};
+
static struct PyModuleDef _string_module = {
PyModuleDef_HEAD_INIT,
.m_name = "_string",
.m_doc = PyDoc_STR("string helper module"),
.m_size = 0,
.m_methods = _string_methods,
+ .m_slots = module_slots,
};
PyMODINIT_FUNC