From 31f2a419764a8ba77c2a970cfc80056c6cd06756 Mon Sep 17 00:00:00 2001 From: shadchin Date: Mon, 12 Feb 2024 07:53:52 +0300 Subject: Update Python from 3.11.8 to 3.12.2 --- .../python3/src/Include/cpython/unicodeobject.h | 442 ++++++--------------- 1 file changed, 126 insertions(+), 316 deletions(-) (limited to 'contrib/tools/python3/src/Include/cpython/unicodeobject.h') diff --git a/contrib/tools/python3/src/Include/cpython/unicodeobject.h b/contrib/tools/python3/src/Include/cpython/unicodeobject.h index 84307d18854..f177cd9e2af 100644 --- a/contrib/tools/python3/src/Include/cpython/unicodeobject.h +++ b/contrib/tools/python3/src/Include/cpython/unicodeobject.h @@ -11,63 +11,43 @@ /* --- Internal Unicode Operations ---------------------------------------- */ -#ifndef USE_UNICODE_WCHAR_CACHE -# define USE_UNICODE_WCHAR_CACHE 1 -#endif /* USE_UNICODE_WCHAR_CACHE */ - -/* Since splitting on whitespace is an important use case, and - whitespace in most situations is solely ASCII whitespace, we - optimize for the common case by using a quick look-up table - _Py_ascii_whitespace (see below) with an inlined check. - - */ -#define Py_UNICODE_ISSPACE(ch) \ - ((Py_UCS4)(ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) - -#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) -#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) -#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) -#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) - -#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) -#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) -#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) - -#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) -#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) -#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) -#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) +// Static inline functions to work with surrogates +static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) { + return (0xD800 <= ch && ch <= 0xDFFF); +} +static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) { + return (0xD800 <= ch && ch <= 0xDBFF); +} +static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) { + return (0xDC00 <= ch && ch <= 0xDFFF); +} -#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) -#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) -#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) +// Join two surrogate characters and return a single Py_UCS4 value. +static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low) { + assert(Py_UNICODE_IS_HIGH_SURROGATE(high)); + assert(Py_UNICODE_IS_LOW_SURROGATE(low)); + return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF)); +} -#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) +// High surrogate = top 10 bits added to 0xD800. +// The character must be in the range [U+10000; U+10ffff]. +static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) { + assert(0x10000 <= ch && ch <= 0x10ffff); + return (0xD800 - (0x10000 >> 10) + (ch >> 10)); +} -#define Py_UNICODE_ISALNUM(ch) \ - (Py_UNICODE_ISALPHA(ch) || \ - Py_UNICODE_ISDECIMAL(ch) || \ - Py_UNICODE_ISDIGIT(ch) || \ - Py_UNICODE_ISNUMERIC(ch)) - -/* macros to work with surrogates */ -#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF) -#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF) -#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF) -/* Join two surrogate characters and return a single Py_UCS4 value. */ -#define Py_UNICODE_JOIN_SURROGATES(high, low) \ - (((((Py_UCS4)(high) & 0x03FF) << 10) | \ - ((Py_UCS4)(low) & 0x03FF)) + 0x10000) -/* high surrogate = top 10 bits added to D800 */ -#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10)) -/* low surrogate = bottom 10 bits added to DC00 */ -#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF)) +// Low surrogate = bottom 10 bits added to 0xDC00. +// The character must be in the range [U+10000; U+10ffff]. +static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) { + assert(0x10000 <= ch && ch <= 0x10ffff); + return (0xDC00 + (ch & 0x3FF)); +} /* --- Unicode Type ------------------------------------------------------- */ /* ASCII-only strings created through PyUnicode_New use the PyASCIIObject structure. state.ascii and state.compact are set, and the data - immediately follow the structure. utf8_length and wstr_length can be found + immediately follow the structure. utf8_length can be found in the length field; the utf8 pointer is equal to the data pointer. */ typedef struct { /* There are 4 forms of Unicode strings: @@ -79,8 +59,7 @@ typedef struct { * kind = PyUnicode_1BYTE_KIND * compact = 1 * ascii = 1 - * ready = 1 - * (length is the length of the utf8 and wstr strings) + * (length is the length of the utf8) * (data starts just after the structure) * (since ASCII is decoded from UTF-8, the utf8 string are the data) @@ -91,55 +70,27 @@ typedef struct { * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND * compact = 1 - * ready = 1 * ascii = 0 * utf8 is not shared with data * utf8_length = 0 if utf8 is NULL - * wstr is shared with data and wstr_length=length - if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 - or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4 - * wstr_length = 0 if wstr is NULL * (data starts just after the structure) - - legacy string, not ready: - - * structure = PyUnicodeObject - * test: kind == PyUnicode_WCHAR_KIND - * length = 0 (use wstr_length) - * hash = -1 - * kind = PyUnicode_WCHAR_KIND - * compact = 0 - * ascii = 0 - * ready = 0 - * interned = SSTATE_NOT_INTERNED - * wstr is not NULL - * data.any is NULL - * utf8 is NULL - * utf8_length = 0 - - - legacy string, ready: + - legacy string: * structure = PyUnicodeObject structure - * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND + * test: !PyUnicode_IS_COMPACT(op) * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND * compact = 0 - * ready = 1 * data.any is not NULL * utf8 is shared and utf8_length = length with data.any if ascii = 1 * utf8_length = 0 if utf8 is NULL - * wstr is shared with data.any and wstr_length = length - if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 - or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4 - * wstr_length = 0 if wstr is NULL Compact strings use only one memory block (structure + characters), whereas legacy strings use one block for the structure and one block for characters. - Legacy strings are created by PyUnicode_FromUnicode() and - PyUnicode_FromStringAndSize(NULL, size) functions. They become ready - when PyUnicode_READY() is called. + Legacy strings are created by subclasses of Unicode. See also _PyUnicode_CheckConsistency(). */ @@ -147,22 +98,18 @@ typedef struct { Py_ssize_t length; /* Number of code points in the string */ Py_hash_t hash; /* Hash value; -1 if not set */ struct { - /* - SSTATE_NOT_INTERNED (0) - SSTATE_INTERNED_MORTAL (1) - SSTATE_INTERNED_IMMORTAL (2) - - If interned != SSTATE_NOT_INTERNED, the two references from the + /* If interned is non-zero, the two references from the dictionary to this object are *not* counted in ob_refcnt. - */ + The possible values here are: + 0: Not Interned + 1: Interned + 2: Interned and Immortal + 3: Interned, Immortal, and Static + This categorization allows the runtime to determine the right + cleanup mechanism at runtime shutdown. */ unsigned int interned:2; /* Character size: - - PyUnicode_WCHAR_KIND (0): - - * character type = wchar_t (16 or 32 bits, depending on the - platform) - - PyUnicode_1BYTE_KIND (1): * character type = Py_UCS1 (8 bits, unsigned) @@ -193,16 +140,12 @@ typedef struct { and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is set, use the PyASCIIObject structure. */ unsigned int ascii:1; - /* The ready flag indicates whether the object layout is initialized - completely. This means that this is either a compact object, or - the data pointer is filled out. The bit is redundant, and helps - to minimize the test in PyUnicode_IS_READY(). */ - unsigned int ready:1; + /* The object is statically allocated. */ + unsigned int statically_allocated:1; /* Padding to ensure that PyUnicode_DATA() is always aligned to 4 bytes (see issue #19537 on m68k). */ unsigned int :24; } state; - wchar_t *wstr; /* wchar_t representation (null-terminated) */ } PyASCIIObject; /* Non-ASCII strings allocated through PyUnicode_New use the @@ -213,13 +156,9 @@ typedef struct { Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the * terminating \0. */ char *utf8; /* UTF-8 representation (null-terminated) */ - Py_ssize_t wstr_length; /* Number of code points in wstr, possible - * surrogates count as two code points. */ } PyCompactUnicodeObject; -/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the - PyUnicodeObject structure. The actual string data is initially in the wstr - block, and copied into the data block using _PyUnicode_Ready. */ +/* Object format for Unicode subclasses. */ typedef struct { PyCompactUnicodeObject _base; union { @@ -254,68 +193,56 @@ PyAPI_FUNC(int) _PyUnicode_CheckConsistency( #define SSTATE_NOT_INTERNED 0 #define SSTATE_INTERNED_MORTAL 1 #define SSTATE_INTERNED_IMMORTAL 2 +#define SSTATE_INTERNED_IMMORTAL_STATIC 3 /* Use only if you know it's a string */ static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) { return _PyASCIIObject_CAST(op)->state.interned; } -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op)) -#endif +#define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op)) -/* Fast check to determine whether an object is ready. Equivalent to: - PyUnicode_IS_COMPACT(op) || _PyUnicodeObject_CAST(op)->data.any */ -static inline unsigned int PyUnicode_IS_READY(PyObject *op) { - return _PyASCIIObject_CAST(op)->state.ready; +/* For backward compatibility */ +static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) { + return 1; } -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op)) -#endif +#define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op)) /* Return true if the string contains only ASCII characters, or 0 if not. The string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be ready. */ static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) { - assert(PyUnicode_IS_READY(op)); return _PyASCIIObject_CAST(op)->state.ascii; } -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op)) -#endif +#define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op)) /* Return true if the string is compact or 0 if not. No type checks or Ready calls are performed. */ static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) { return _PyASCIIObject_CAST(op)->state.compact; } -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op)) -#endif +#define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op)) /* Return true if the string is a compact ASCII string (use PyASCIIObject structure), or 0 if not. No type checks or Ready calls are performed. */ static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) { return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op)); } -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op)) -#endif +#define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op)) enum PyUnicode_Kind { -/* String contains only wstr byte characters. This is only possible - when the string was created with a legacy API and _PyUnicode_Ready() - has not been called yet. */ - PyUnicode_WCHAR_KIND = 0, /* Return values of the PyUnicode_KIND() function: */ PyUnicode_1BYTE_KIND = 1, PyUnicode_2BYTE_KIND = 2, PyUnicode_4BYTE_KIND = 4 }; -/* Return one of the PyUnicode_*_KIND values defined above. */ -#define PyUnicode_KIND(op) \ - (assert(PyUnicode_IS_READY(op)), \ - _PyASCIIObject_CAST(op)->state.kind) +// PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above. +// +// gh-89653: Converting this macro to a static inline function would introduce +// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and +// unsigned numbers) where kind type is an int or on +// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned). +#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind) /* Return a void pointer to the raw unicode buffer. */ static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) { @@ -339,9 +266,7 @@ static inline void* PyUnicode_DATA(PyObject *op) { } return _PyUnicode_NONCOMPACT_DATA(op); } -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op)) -#endif +#define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op)) /* Return pointers to the canonical representation cast to unsigned char, Py_UCS2, or Py_UCS4 for direct character access. @@ -352,16 +277,11 @@ static inline void* PyUnicode_DATA(PyObject *op) { #define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op)) #define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op)) -/* Returns the length of the unicode string. The caller has to make sure that - the string has it's canonical representation set before calling - this function. Call PyUnicode_(FAST_)Ready to ensure that. */ +/* Returns the length of the unicode string. */ static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) { - assert(PyUnicode_IS_READY(op)); return _PyASCIIObject_CAST(op)->length; } -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op)) -#endif +#define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op)) /* Write into the canonical representation, this function does not do any sanity checks and is intended for usage in loops. The caller should cache the @@ -371,6 +291,7 @@ static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) { static inline void PyUnicode_WRITE(int kind, void *data, Py_ssize_t index, Py_UCS4 value) { + assert(index >= 0); if (kind == PyUnicode_1BYTE_KIND) { assert(value <= 0xffU); _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value); @@ -385,17 +306,16 @@ static inline void PyUnicode_WRITE(int kind, void *data, _Py_STATIC_CAST(Py_UCS4*, data)[index] = value; } } -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 #define PyUnicode_WRITE(kind, data, index, value) \ PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \ (index), _Py_STATIC_CAST(Py_UCS4, value)) -#endif /* Read a code point from the string's canonical representation. No checks or ready calls are performed. */ static inline Py_UCS4 PyUnicode_READ(int kind, const void *data, Py_ssize_t index) { + assert(index >= 0); if (kind == PyUnicode_1BYTE_KIND) { return _Py_STATIC_CAST(const Py_UCS1*, data)[index]; } @@ -405,12 +325,10 @@ static inline Py_UCS4 PyUnicode_READ(int kind, assert(kind == PyUnicode_4BYTE_KIND); return _Py_STATIC_CAST(const Py_UCS4*, data)[index]; } -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 #define PyUnicode_READ(kind, data, index) \ PyUnicode_READ(_Py_STATIC_CAST(int, kind), \ _Py_STATIC_CAST(const void*, data), \ (index)) -#endif /* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it calls PyUnicode_KIND() and might call it twice. For single reads, use @@ -419,7 +337,11 @@ static inline Py_UCS4 PyUnicode_READ(int kind, static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index) { int kind; - assert(PyUnicode_IS_READY(unicode)); + + assert(index >= 0); + // Tolerate reading the NUL character at str[len(str)] + assert(index <= PyUnicode_GET_LENGTH(unicode)); + kind = PyUnicode_KIND(unicode); if (kind == PyUnicode_1BYTE_KIND) { return PyUnicode_1BYTE_DATA(unicode)[index]; @@ -430,10 +352,8 @@ static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index) assert(kind == PyUnicode_4BYTE_KIND); return PyUnicode_4BYTE_DATA(unicode)[index]; } -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_READ_CHAR(unicode, index) \ - PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index)) -#endif +#define PyUnicode_READ_CHAR(unicode, index) \ + PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index)) /* Return a maximum character value which is suitable for creating another string based on op. This is always an approximation but more efficient @@ -442,7 +362,6 @@ static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op) { int kind; - assert(PyUnicode_IS_READY(op)); if (PyUnicode_IS_ASCII(op)) { return 0x7fU; } @@ -457,10 +376,8 @@ static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op) assert(kind == PyUnicode_4BYTE_KIND); return 0x10ffffU; } -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_MAX_CHAR_VALUE(op) \ - PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op)) -#endif +#define PyUnicode_MAX_CHAR_VALUE(op) \ + PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op)) /* === Public API ========================================================= */ @@ -474,31 +391,12 @@ PyAPI_FUNC(PyObject*) PyUnicode_New( Py_UCS4 maxchar /* maximum code point value in the string */ ); -/* Initializes the canonical string representation from the deprecated - wstr/Py_UNICODE representation. This function is used to convert Unicode - objects which were created using the old API to the new flexible format - introduced with PEP 393. - - Don't call this function directly, use the public PyUnicode_READY() function - instead. */ -PyAPI_FUNC(int) _PyUnicode_Ready( - PyObject *unicode /* Unicode object */ - ); - -/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best - case. If the canonical representation is not yet set, it will still call - _PyUnicode_Ready(). - Returns 0 on success and -1 on errors. */ -static inline int PyUnicode_READY(PyObject *op) +/* For backward compatibility */ +static inline int PyUnicode_READY(PyObject* Py_UNUSED(op)) { - if (PyUnicode_IS_READY(op)) { - return 0; - } - return _PyUnicode_Ready(op); + return 0; } -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op)) -#endif +#define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op)) /* Get a copy of a Unicode string. */ PyAPI_FUNC(PyObject*) _PyUnicode_Copy( @@ -586,139 +484,12 @@ PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar ( Py_ssize_t start, Py_ssize_t end); -/* --- Legacy deprecated API ---------------------------------------------- */ - -/* Create a Unicode Object from the Py_UNICODE buffer u of the given - size. - - u may be NULL which causes the contents to be undefined. It is the - user's responsibility to fill in the needed data afterwards. Note - that modifying the Unicode object contents after construction is - only allowed if u was set to NULL. - - The buffer is copied into the new object. */ -Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( - const Py_UNICODE *u, /* Unicode buffer */ - Py_ssize_t size /* size of buffer */ - ); - -/* Return a read-only pointer to the Unicode object's internal - Py_UNICODE buffer. - If the wchar_t/Py_UNICODE representation is not yet available, this - function will calculate it. */ -Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( - PyObject *unicode /* Unicode object */ - ); - -/* Similar to PyUnicode_AsUnicode(), but raises a ValueError if the string - contains null characters. */ -PyAPI_FUNC(const Py_UNICODE *) _PyUnicode_AsUnicode( - PyObject *unicode /* Unicode object */ - ); - -/* Return a read-only pointer to the Unicode object's internal - Py_UNICODE buffer and save the length at size. - If the wchar_t/Py_UNICODE representation is not yet available, this - function will calculate it. */ - -Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize( - PyObject *unicode, /* Unicode object */ - Py_ssize_t *size /* location where to save the length */ - ); - - -/* Fast access macros */ - -Py_DEPRECATED(3.3) -static inline Py_ssize_t PyUnicode_WSTR_LENGTH(PyObject *op) -{ - if (PyUnicode_IS_COMPACT_ASCII(op)) { - return _PyASCIIObject_CAST(op)->length; - } - else { - return _PyCompactUnicodeObject_CAST(op)->wstr_length; - } -} -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_WSTR_LENGTH(op) PyUnicode_WSTR_LENGTH(_PyObject_CAST(op)) -#endif - -/* Returns the deprecated Py_UNICODE representation's size in code units - (this includes surrogate pairs as 2 units). - If the Py_UNICODE representation is not available, it will be computed - on request. Use PyUnicode_GET_LENGTH() for the length in code points. */ - -Py_DEPRECATED(3.3) -static inline Py_ssize_t PyUnicode_GET_SIZE(PyObject *op) -{ - _Py_COMP_DIAG_PUSH - _Py_COMP_DIAG_IGNORE_DEPR_DECLS - if (_PyASCIIObject_CAST(op)->wstr == _Py_NULL) { - (void)PyUnicode_AsUnicode(op); - assert(_PyASCIIObject_CAST(op)->wstr != _Py_NULL); - } - return PyUnicode_WSTR_LENGTH(op); - _Py_COMP_DIAG_POP -} -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_GET_SIZE(op) PyUnicode_GET_SIZE(_PyObject_CAST(op)) -#endif - -Py_DEPRECATED(3.3) -static inline Py_ssize_t PyUnicode_GET_DATA_SIZE(PyObject *op) -{ - _Py_COMP_DIAG_PUSH - _Py_COMP_DIAG_IGNORE_DEPR_DECLS - return PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE; - _Py_COMP_DIAG_POP -} -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_GET_DATA_SIZE(op) PyUnicode_GET_DATA_SIZE(_PyObject_CAST(op)) -#endif - -/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE - representation on demand. Using this macro is very inefficient now, - try to port your code to use the new PyUnicode_*BYTE_DATA() macros or - use PyUnicode_WRITE() and PyUnicode_READ(). */ - -Py_DEPRECATED(3.3) -static inline Py_UNICODE* PyUnicode_AS_UNICODE(PyObject *op) -{ - wchar_t *wstr = _PyASCIIObject_CAST(op)->wstr; - if (wstr != _Py_NULL) { - return wstr; - } - - _Py_COMP_DIAG_PUSH - _Py_COMP_DIAG_IGNORE_DEPR_DECLS - return PyUnicode_AsUnicode(op); - _Py_COMP_DIAG_POP -} -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_AS_UNICODE(op) PyUnicode_AS_UNICODE(_PyObject_CAST(op)) -#endif - -Py_DEPRECATED(3.3) -static inline const char* PyUnicode_AS_DATA(PyObject *op) -{ - _Py_COMP_DIAG_PUSH - _Py_COMP_DIAG_IGNORE_DEPR_DECLS - Py_UNICODE *data = PyUnicode_AS_UNICODE(op); - // In C++, casting directly PyUnicode* to const char* is not valid - return _Py_STATIC_CAST(const char*, _Py_STATIC_CAST(const void*, data)); - _Py_COMP_DIAG_POP -} -#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000 -# define PyUnicode_AS_DATA(op) PyUnicode_AS_DATA(_PyObject_CAST(op)) -#endif - - /* --- _PyUnicodeWriter API ----------------------------------------------- */ typedef struct { PyObject *buffer; void *data; - enum PyUnicode_Kind kind; + int kind; Py_UCS4 maxchar; Py_ssize_t size; Py_ssize_t pos; @@ -769,8 +540,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, Return 0 on success, raise an exception and return -1 on error. */ #define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \ - (assert((KIND) != PyUnicode_WCHAR_KIND), \ - (KIND) <= (WRITER)->kind \ + ((KIND) <= (WRITER)->kind \ ? 0 \ : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND))) @@ -778,7 +548,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, macro instead. */ PyAPI_FUNC(int) _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, - enum PyUnicode_Kind kind); + int kind); /* Append a Unicode character. Return 0 on success, raise an exception and return -1 on error. */ @@ -1024,10 +794,6 @@ PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping( /* === Characters Type APIs =============================================== */ -/* Helper array used by Py_UNICODE_ISSPACE(). */ - -PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; - /* These should not be used directly. Use the Py_UNICODE_IS* and Py_UNICODE_TO* macros instead. @@ -1135,6 +901,50 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); +// Helper array used by Py_UNICODE_ISSPACE(). +PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; + +// Since splitting on whitespace is an important use case, and +// whitespace in most situations is solely ASCII whitespace, we +// optimize for the common case by using a quick look-up table +// _Py_ascii_whitespace (see below) with an inlined check. +static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) { + if (ch < 128) { + return _Py_ascii_whitespace[ch]; + } + return _PyUnicode_IsWhitespace(ch); +} + +#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) +#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) +#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) +#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) + +#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) +#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) +#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) + +#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) +#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) +#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) +#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) + +#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) +#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) +#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) + +#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) + +static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) { + return (Py_UNICODE_ISALPHA(ch) + || Py_UNICODE_ISDECIMAL(ch) + || Py_UNICODE_ISDIGIT(ch) + || Py_UNICODE_ISNUMERIC(ch)); +} + + +/* === Misc functions ===================================================== */ + PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int); /* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/ @@ -1144,7 +954,7 @@ PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*); and where the hash values are equal (i.e. a very probable match) */ PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *); -/* Equality check. Returns -1 on failure. */ +/* Equality check. */ PyAPI_FUNC(int) _PyUnicode_Equal(PyObject *, PyObject *); PyAPI_FUNC(int) _PyUnicode_WideCharString_Converter(PyObject *, void *); -- cgit v1.3