summaryrefslogtreecommitdiffstats
path: root/contrib/tools/python3/src/Include/cpython/unicodeobject.h
diff options
context:
space:
mode:
authorAlexSm <[email protected]>2024-02-16 11:51:30 +0100
committerGitHub <[email protected]>2024-02-16 11:51:30 +0100
commit506ecaee93b52cc12c2e2f97c3d42e3ca2a7f59e (patch)
treed096fb9eb988fbb0ca1ba970041773207ce3aa70 /contrib/tools/python3/src/Include/cpython/unicodeobject.h
parent4749b9e5d260714490997e6f5ee1ee8c1c8fc46c (diff)
parentf200f72c9d7a89c1018e3dc6b46c49fe2ecf84fb (diff)
Merge pull request #1940 from dcherednik/importlib
Library import 14
Diffstat (limited to 'contrib/tools/python3/src/Include/cpython/unicodeobject.h')
-rw-r--r--contrib/tools/python3/src/Include/cpython/unicodeobject.h442
1 files changed, 126 insertions, 316 deletions
diff --git a/contrib/tools/python3/src/Include/cpython/unicodeobject.h b/contrib/tools/python3/src/Include/cpython/unicodeobject.h
index 84307d18854..f177cd9e2af 100644
--- a/contrib/tools/python3/src/Include/cpython/unicodeobject.h
+++ b/contrib/tools/python3/src/Include/cpython/unicodeobject.h
@@ -11,63 +11,43 @@
/* --- Internal Unicode Operations ---------------------------------------- */
-#ifndef USE_UNICODE_WCHAR_CACHE
-# define USE_UNICODE_WCHAR_CACHE 1
-#endif /* USE_UNICODE_WCHAR_CACHE */
-
-/* Since splitting on whitespace is an important use case, and
- whitespace in most situations is solely ASCII whitespace, we
- optimize for the common case by using a quick look-up table
- _Py_ascii_whitespace (see below) with an inlined check.
-
- */
-#define Py_UNICODE_ISSPACE(ch) \
- ((Py_UCS4)(ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
-
-#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
-#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
-#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
-#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
-
-#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
-#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
-#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
-
-#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
-#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
-#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
-#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
-
-#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
-#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
-#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
+// Static inline functions to work with surrogates
+static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) {
+ return (0xD800 <= ch && ch <= 0xDFFF);
+}
+static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) {
+ return (0xD800 <= ch && ch <= 0xDBFF);
+}
+static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) {
+ return (0xDC00 <= ch && ch <= 0xDFFF);
+}
-#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
+// Join two surrogate characters and return a single Py_UCS4 value.
+static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low) {
+ assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
+ assert(Py_UNICODE_IS_LOW_SURROGATE(low));
+ return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF));
+}
-#define Py_UNICODE_ISALNUM(ch) \
- (Py_UNICODE_ISALPHA(ch) || \
- Py_UNICODE_ISDECIMAL(ch) || \
- Py_UNICODE_ISDIGIT(ch) || \
- Py_UNICODE_ISNUMERIC(ch))
+// High surrogate = top 10 bits added to 0xD800.
+// The character must be in the range [U+10000; U+10ffff].
+static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) {
+ assert(0x10000 <= ch && ch <= 0x10ffff);
+ return (0xD800 - (0x10000 >> 10) + (ch >> 10));
+}
-/* macros to work with surrogates */
-#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
-#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
-#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
-/* Join two surrogate characters and return a single Py_UCS4 value. */
-#define Py_UNICODE_JOIN_SURROGATES(high, low) \
- (((((Py_UCS4)(high) & 0x03FF) << 10) | \
- ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
-/* high surrogate = top 10 bits added to D800 */
-#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
-/* low surrogate = bottom 10 bits added to DC00 */
-#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
+// Low surrogate = bottom 10 bits added to 0xDC00.
+// The character must be in the range [U+10000; U+10ffff].
+static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
+ assert(0x10000 <= ch && ch <= 0x10ffff);
+ return (0xDC00 + (ch & 0x3FF));
+}
/* --- Unicode Type ------------------------------------------------------- */
/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
structure. state.ascii and state.compact are set, and the data
- immediately follow the structure. utf8_length and wstr_length can be found
+ immediately follow the structure. utf8_length can be found
in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct {
/* There are 4 forms of Unicode strings:
@@ -79,8 +59,7 @@ typedef struct {
* kind = PyUnicode_1BYTE_KIND
* compact = 1
* ascii = 1
- * ready = 1
- * (length is the length of the utf8 and wstr strings)
+ * (length is the length of the utf8)
* (data starts just after the structure)
* (since ASCII is decoded from UTF-8, the utf8 string are the data)
@@ -91,55 +70,27 @@ typedef struct {
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
PyUnicode_4BYTE_KIND
* compact = 1
- * ready = 1
* ascii = 0
* utf8 is not shared with data
* utf8_length = 0 if utf8 is NULL
- * wstr is shared with data and wstr_length=length
- if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
- or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
- * wstr_length = 0 if wstr is NULL
* (data starts just after the structure)
- - legacy string, not ready:
-
- * structure = PyUnicodeObject
- * test: kind == PyUnicode_WCHAR_KIND
- * length = 0 (use wstr_length)
- * hash = -1
- * kind = PyUnicode_WCHAR_KIND
- * compact = 0
- * ascii = 0
- * ready = 0
- * interned = SSTATE_NOT_INTERNED
- * wstr is not NULL
- * data.any is NULL
- * utf8 is NULL
- * utf8_length = 0
-
- - legacy string, ready:
+ - legacy string:
* structure = PyUnicodeObject structure
- * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
+ * test: !PyUnicode_IS_COMPACT(op)
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
PyUnicode_4BYTE_KIND
* compact = 0
- * ready = 1
* data.any is not NULL
* utf8 is shared and utf8_length = length with data.any if ascii = 1
* utf8_length = 0 if utf8 is NULL
- * wstr is shared with data.any and wstr_length = length
- if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
- or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
- * wstr_length = 0 if wstr is NULL
Compact strings use only one memory block (structure + characters),
whereas legacy strings use one block for the structure and one block
for characters.
- Legacy strings are created by PyUnicode_FromUnicode() and
- PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
- when PyUnicode_READY() is called.
+ Legacy strings are created by subclasses of Unicode.
See also _PyUnicode_CheckConsistency().
*/
@@ -147,22 +98,18 @@ typedef struct {
Py_ssize_t length; /* Number of code points in the string */
Py_hash_t hash; /* Hash value; -1 if not set */
struct {
- /*
- SSTATE_NOT_INTERNED (0)
- SSTATE_INTERNED_MORTAL (1)
- SSTATE_INTERNED_IMMORTAL (2)
-
- If interned != SSTATE_NOT_INTERNED, the two references from the
+ /* If interned is non-zero, the two references from the
dictionary to this object are *not* counted in ob_refcnt.
- */
+ The possible values here are:
+ 0: Not Interned
+ 1: Interned
+ 2: Interned and Immortal
+ 3: Interned, Immortal, and Static
+ This categorization allows the runtime to determine the right
+ cleanup mechanism at runtime shutdown. */
unsigned int interned:2;
/* Character size:
- - PyUnicode_WCHAR_KIND (0):
-
- * character type = wchar_t (16 or 32 bits, depending on the
- platform)
-
- PyUnicode_1BYTE_KIND (1):
* character type = Py_UCS1 (8 bits, unsigned)
@@ -193,16 +140,12 @@ typedef struct {
and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
set, use the PyASCIIObject structure. */
unsigned int ascii:1;
- /* The ready flag indicates whether the object layout is initialized
- completely. This means that this is either a compact object, or
- the data pointer is filled out. The bit is redundant, and helps
- to minimize the test in PyUnicode_IS_READY(). */
- unsigned int ready:1;
+ /* The object is statically allocated. */
+ unsigned int statically_allocated:1;
/* Padding to ensure that PyUnicode_DATA() is always aligned to
4 bytes (see issue #19537 on m68k). */
unsigned int :24;
} state;
- wchar_t *wstr; /* wchar_t representation (null-terminated) */
} PyASCIIObject;
/* Non-ASCII strings allocated through PyUnicode_New use the
@@ -213,13 +156,9 @@ typedef struct {
Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
* terminating \0. */
char *utf8; /* UTF-8 representation (null-terminated) */
- Py_ssize_t wstr_length; /* Number of code points in wstr, possible
- * surrogates count as two code points. */
} PyCompactUnicodeObject;
-/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
- PyUnicodeObject structure. The actual string data is initially in the wstr
- block, and copied into the data block using _PyUnicode_Ready. */
+/* Object format for Unicode subclasses. */
typedef struct {
PyCompactUnicodeObject _base;
union {
@@ -254,68 +193,56 @@ PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
+#define SSTATE_INTERNED_IMMORTAL_STATIC 3
/* Use only if you know it's a string */
static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
return _PyASCIIObject_CAST(op)->state.interned;
}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
-# define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
-#endif
+#define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
-/* Fast check to determine whether an object is ready. Equivalent to:
- PyUnicode_IS_COMPACT(op) || _PyUnicodeObject_CAST(op)->data.any */
-static inline unsigned int PyUnicode_IS_READY(PyObject *op) {
- return _PyASCIIObject_CAST(op)->state.ready;
+/* For backward compatibility */
+static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
+ return 1;
}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
-# define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
-#endif
+#define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
/* Return true if the string contains only ASCII characters, or 0 if not. The
string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
ready. */
static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
- assert(PyUnicode_IS_READY(op));
return _PyASCIIObject_CAST(op)->state.ascii;
}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
-# define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
-#endif
+#define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
/* Return true if the string is compact or 0 if not.
No type checks or Ready calls are performed. */
static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
return _PyASCIIObject_CAST(op)->state.compact;
}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
-# define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
-#endif
+#define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
/* Return true if the string is a compact ASCII string (use PyASCIIObject
structure), or 0 if not. No type checks or Ready calls are performed. */
static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
-# define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
-#endif
+#define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
enum PyUnicode_Kind {
-/* String contains only wstr byte characters. This is only possible
- when the string was created with a legacy API and _PyUnicode_Ready()
- has not been called yet. */
- PyUnicode_WCHAR_KIND = 0,
/* Return values of the PyUnicode_KIND() function: */
PyUnicode_1BYTE_KIND = 1,
PyUnicode_2BYTE_KIND = 2,
PyUnicode_4BYTE_KIND = 4
};
-/* Return one of the PyUnicode_*_KIND values defined above. */
-#define PyUnicode_KIND(op) \
- (assert(PyUnicode_IS_READY(op)), \
- _PyASCIIObject_CAST(op)->state.kind)
+// PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above.
+//
+// gh-89653: Converting this macro to a static inline function would introduce
+// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
+// unsigned numbers) where kind type is an int or on
+// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
+#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
/* Return a void pointer to the raw unicode buffer. */
static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
@@ -339,9 +266,7 @@ static inline void* PyUnicode_DATA(PyObject *op) {
}
return _PyUnicode_NONCOMPACT_DATA(op);
}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
-# define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op))
-#endif
+#define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op))
/* Return pointers to the canonical representation cast to unsigned char,
Py_UCS2, or Py_UCS4 for direct character access.
@@ -352,16 +277,11 @@ static inline void* PyUnicode_DATA(PyObject *op) {
#define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))
-/* Returns the length of the unicode string. The caller has to make sure that
- the string has it's canonical representation set before calling
- this function. Call PyUnicode_(FAST_)Ready to ensure that. */
+/* Returns the length of the unicode string. */
static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
- assert(PyUnicode_IS_READY(op));
return _PyASCIIObject_CAST(op)->length;
}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
-# define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
-#endif
+#define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
/* Write into the canonical representation, this function does not do any sanity
checks and is intended for usage in loops. The caller should cache the
@@ -371,6 +291,7 @@ static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
static inline void PyUnicode_WRITE(int kind, void *data,
Py_ssize_t index, Py_UCS4 value)
{
+ assert(index >= 0);
if (kind == PyUnicode_1BYTE_KIND) {
assert(value <= 0xffU);
_Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
@@ -385,17 +306,16 @@ static inline void PyUnicode_WRITE(int kind, void *data,
_Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
}
}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
#define PyUnicode_WRITE(kind, data, index, value) \
PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
(index), _Py_STATIC_CAST(Py_UCS4, value))
-#endif
/* Read a code point from the string's canonical representation. No checks
or ready calls are performed. */
static inline Py_UCS4 PyUnicode_READ(int kind,
const void *data, Py_ssize_t index)
{
+ assert(index >= 0);
if (kind == PyUnicode_1BYTE_KIND) {
return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
}
@@ -405,12 +325,10 @@ static inline Py_UCS4 PyUnicode_READ(int kind,
assert(kind == PyUnicode_4BYTE_KIND);
return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
#define PyUnicode_READ(kind, data, index) \
PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
_Py_STATIC_CAST(const void*, data), \
(index))
-#endif
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
calls PyUnicode_KIND() and might call it twice. For single reads, use
@@ -419,7 +337,11 @@ static inline Py_UCS4 PyUnicode_READ(int kind,
static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
{
int kind;
- assert(PyUnicode_IS_READY(unicode));
+
+ assert(index >= 0);
+ // Tolerate reading the NUL character at str[len(str)]
+ assert(index <= PyUnicode_GET_LENGTH(unicode));
+
kind = PyUnicode_KIND(unicode);
if (kind == PyUnicode_1BYTE_KIND) {
return PyUnicode_1BYTE_DATA(unicode)[index];
@@ -430,10 +352,8 @@ static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
assert(kind == PyUnicode_4BYTE_KIND);
return PyUnicode_4BYTE_DATA(unicode)[index];
}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
-# define PyUnicode_READ_CHAR(unicode, index) \
- PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
-#endif
+#define PyUnicode_READ_CHAR(unicode, index) \
+ PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
/* Return a maximum character value which is suitable for creating another
string based on op. This is always an approximation but more efficient
@@ -442,7 +362,6 @@ static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
{
int kind;
- assert(PyUnicode_IS_READY(op));
if (PyUnicode_IS_ASCII(op)) {
return 0x7fU;
}
@@ -457,10 +376,8 @@ static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
assert(kind == PyUnicode_4BYTE_KIND);
return 0x10ffffU;
}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
-# define PyUnicode_MAX_CHAR_VALUE(op) \
- PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
-#endif
+#define PyUnicode_MAX_CHAR_VALUE(op) \
+ PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
/* === Public API ========================================================= */
@@ -474,31 +391,12 @@ PyAPI_FUNC(PyObject*) PyUnicode_New(
Py_UCS4 maxchar /* maximum code point value in the string */
);
-/* Initializes the canonical string representation from the deprecated
- wstr/Py_UNICODE representation. This function is used to convert Unicode
- objects which were created using the old API to the new flexible format
- introduced with PEP 393.
-
- Don't call this function directly, use the public PyUnicode_READY() function
- instead. */
-PyAPI_FUNC(int) _PyUnicode_Ready(
- PyObject *unicode /* Unicode object */
- );
-
-/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
- case. If the canonical representation is not yet set, it will still call
- _PyUnicode_Ready().
- Returns 0 on success and -1 on errors. */
-static inline int PyUnicode_READY(PyObject *op)
+/* For backward compatibility */
+static inline int PyUnicode_READY(PyObject* Py_UNUSED(op))
{
- if (PyUnicode_IS_READY(op)) {
- return 0;
- }
- return _PyUnicode_Ready(op);
+ return 0;
}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
-# define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
-#endif
+#define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
/* Get a copy of a Unicode string. */
PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
@@ -586,139 +484,12 @@ PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
Py_ssize_t start,
Py_ssize_t end);
-/* --- Legacy deprecated API ---------------------------------------------- */
-
-/* Create a Unicode Object from the Py_UNICODE buffer u of the given
- size.
-
- u may be NULL which causes the contents to be undefined. It is the
- user's responsibility to fill in the needed data afterwards. Note
- that modifying the Unicode object contents after construction is
- only allowed if u was set to NULL.
-
- The buffer is copied into the new object. */
-Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
- const Py_UNICODE *u, /* Unicode buffer */
- Py_ssize_t size /* size of buffer */
- );
-
-/* Return a read-only pointer to the Unicode object's internal
- Py_UNICODE buffer.
- If the wchar_t/Py_UNICODE representation is not yet available, this
- function will calculate it. */
-Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
- PyObject *unicode /* Unicode object */
- );
-
-/* Similar to PyUnicode_AsUnicode(), but raises a ValueError if the string
- contains null characters. */
-PyAPI_FUNC(const Py_UNICODE *) _PyUnicode_AsUnicode(
- PyObject *unicode /* Unicode object */
- );
-
-/* Return a read-only pointer to the Unicode object's internal
- Py_UNICODE buffer and save the length at size.
- If the wchar_t/Py_UNICODE representation is not yet available, this
- function will calculate it. */
-
-Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
- PyObject *unicode, /* Unicode object */
- Py_ssize_t *size /* location where to save the length */
- );
-
-
-/* Fast access macros */
-
-Py_DEPRECATED(3.3)
-static inline Py_ssize_t PyUnicode_WSTR_LENGTH(PyObject *op)
-{
- if (PyUnicode_IS_COMPACT_ASCII(op)) {
- return _PyASCIIObject_CAST(op)->length;
- }
- else {
- return _PyCompactUnicodeObject_CAST(op)->wstr_length;
- }
-}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
-# define PyUnicode_WSTR_LENGTH(op) PyUnicode_WSTR_LENGTH(_PyObject_CAST(op))
-#endif
-
-/* Returns the deprecated Py_UNICODE representation's size in code units
- (this includes surrogate pairs as 2 units).
- If the Py_UNICODE representation is not available, it will be computed
- on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
-
-Py_DEPRECATED(3.3)
-static inline Py_ssize_t PyUnicode_GET_SIZE(PyObject *op)
-{
- _Py_COMP_DIAG_PUSH
- _Py_COMP_DIAG_IGNORE_DEPR_DECLS
- if (_PyASCIIObject_CAST(op)->wstr == _Py_NULL) {
- (void)PyUnicode_AsUnicode(op);
- assert(_PyASCIIObject_CAST(op)->wstr != _Py_NULL);
- }
- return PyUnicode_WSTR_LENGTH(op);
- _Py_COMP_DIAG_POP
-}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
-# define PyUnicode_GET_SIZE(op) PyUnicode_GET_SIZE(_PyObject_CAST(op))
-#endif
-
-Py_DEPRECATED(3.3)
-static inline Py_ssize_t PyUnicode_GET_DATA_SIZE(PyObject *op)
-{
- _Py_COMP_DIAG_PUSH
- _Py_COMP_DIAG_IGNORE_DEPR_DECLS
- return PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE;
- _Py_COMP_DIAG_POP
-}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
-# define PyUnicode_GET_DATA_SIZE(op) PyUnicode_GET_DATA_SIZE(_PyObject_CAST(op))
-#endif
-
-/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
- representation on demand. Using this macro is very inefficient now,
- try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
- use PyUnicode_WRITE() and PyUnicode_READ(). */
-
-Py_DEPRECATED(3.3)
-static inline Py_UNICODE* PyUnicode_AS_UNICODE(PyObject *op)
-{
- wchar_t *wstr = _PyASCIIObject_CAST(op)->wstr;
- if (wstr != _Py_NULL) {
- return wstr;
- }
-
- _Py_COMP_DIAG_PUSH
- _Py_COMP_DIAG_IGNORE_DEPR_DECLS
- return PyUnicode_AsUnicode(op);
- _Py_COMP_DIAG_POP
-}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
-# define PyUnicode_AS_UNICODE(op) PyUnicode_AS_UNICODE(_PyObject_CAST(op))
-#endif
-
-Py_DEPRECATED(3.3)
-static inline const char* PyUnicode_AS_DATA(PyObject *op)
-{
- _Py_COMP_DIAG_PUSH
- _Py_COMP_DIAG_IGNORE_DEPR_DECLS
- Py_UNICODE *data = PyUnicode_AS_UNICODE(op);
- // In C++, casting directly PyUnicode* to const char* is not valid
- return _Py_STATIC_CAST(const char*, _Py_STATIC_CAST(const void*, data));
- _Py_COMP_DIAG_POP
-}
-#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
-# define PyUnicode_AS_DATA(op) PyUnicode_AS_DATA(_PyObject_CAST(op))
-#endif
-
-
/* --- _PyUnicodeWriter API ----------------------------------------------- */
typedef struct {
PyObject *buffer;
void *data;
- enum PyUnicode_Kind kind;
+ int kind;
Py_UCS4 maxchar;
Py_ssize_t size;
Py_ssize_t pos;
@@ -769,8 +540,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \
- (assert((KIND) != PyUnicode_WCHAR_KIND), \
- (KIND) <= (WRITER)->kind \
+ ((KIND) <= (WRITER)->kind \
? 0 \
: _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
@@ -778,7 +548,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
macro instead. */
PyAPI_FUNC(int)
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
- enum PyUnicode_Kind kind);
+ int kind);
/* Append a Unicode character.
Return 0 on success, raise an exception and return -1 on error. */
@@ -1024,10 +794,6 @@ PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
/* === Characters Type APIs =============================================== */
-/* Helper array used by Py_UNICODE_ISSPACE(). */
-
-PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
-
/* These should not be used directly. Use the Py_UNICODE_IS* and
Py_UNICODE_TO* macros instead.
@@ -1135,6 +901,50 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Py_UCS4 ch /* Unicode character */
);
+// Helper array used by Py_UNICODE_ISSPACE().
+PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
+
+// Since splitting on whitespace is an important use case, and
+// whitespace in most situations is solely ASCII whitespace, we
+// optimize for the common case by using a quick look-up table
+// _Py_ascii_whitespace (see below) with an inlined check.
+static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
+ if (ch < 128) {
+ return _Py_ascii_whitespace[ch];
+ }
+ return _PyUnicode_IsWhitespace(ch);
+}
+
+#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
+#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
+#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
+#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
+
+#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
+#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
+#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
+
+#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
+#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
+#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
+#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
+
+#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
+#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
+#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
+
+#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
+
+static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
+ return (Py_UNICODE_ISALPHA(ch)
+ || Py_UNICODE_ISDECIMAL(ch)
+ || Py_UNICODE_ISDIGIT(ch)
+ || Py_UNICODE_ISNUMERIC(ch));
+}
+
+
+/* === Misc functions ===================================================== */
+
PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
@@ -1144,7 +954,7 @@ PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
and where the hash values are equal (i.e. a very probable match) */
PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
-/* Equality check. Returns -1 on failure. */
+/* Equality check. */
PyAPI_FUNC(int) _PyUnicode_Equal(PyObject *, PyObject *);
PyAPI_FUNC(int) _PyUnicode_WideCharString_Converter(PyObject *, void *);