diff options
author | orivej <orivej@yandex-team.ru> | 2022-02-10 16:45:01 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:01 +0300 |
commit | 2d37894b1b037cf24231090eda8589bbb44fb6fc (patch) | |
tree | be835aa92c6248212e705f25388ebafcf84bc7a1 /contrib/tools/python3/src/Objects/stringlib | |
parent | 718c552901d703c502ccbefdfc3c9028d608b947 (diff) | |
download | ydb-2d37894b1b037cf24231090eda8589bbb44fb6fc.tar.gz |
Restoring authorship annotation for <orivej@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/tools/python3/src/Objects/stringlib')
21 files changed, 4334 insertions, 4334 deletions
diff --git a/contrib/tools/python3/src/Objects/stringlib/asciilib.h b/contrib/tools/python3/src/Objects/stringlib/asciilib.h index 90ba2aa015..e69a2c076e 100644 --- a/contrib/tools/python3/src/Objects/stringlib/asciilib.h +++ b/contrib/tools/python3/src/Objects/stringlib/asciilib.h @@ -1,26 +1,26 @@ -/* this is sort of a hack. there's at least one place (formatting - floats) where some stringlib code takes a different path if it's - compiled as unicode. */ -#define STRINGLIB_IS_UNICODE 1 - -#define FASTSEARCH asciilib_fastsearch -#define STRINGLIB(F) asciilib_##F -#define STRINGLIB_OBJECT PyUnicodeObject -#define STRINGLIB_SIZEOF_CHAR 1 -#define STRINGLIB_MAX_CHAR 0x7Fu -#define STRINGLIB_CHAR Py_UCS1 -#define STRINGLIB_TYPE_NAME "unicode" -#define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_EMPTY unicode_empty -#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE -#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK -#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL -#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL -#define STRINGLIB_STR PyUnicode_1BYTE_DATA -#define STRINGLIB_LEN PyUnicode_GET_LENGTH +/* this is sort of a hack. there's at least one place (formatting + floats) where some stringlib code takes a different path if it's + compiled as unicode. */ +#define STRINGLIB_IS_UNICODE 1 + +#define FASTSEARCH asciilib_fastsearch +#define STRINGLIB(F) asciilib_##F +#define STRINGLIB_OBJECT PyUnicodeObject +#define STRINGLIB_SIZEOF_CHAR 1 +#define STRINGLIB_MAX_CHAR 0x7Fu +#define STRINGLIB_CHAR Py_UCS1 +#define STRINGLIB_TYPE_NAME "unicode" +#define STRINGLIB_PARSE_CODE "U" +#define STRINGLIB_EMPTY unicode_empty +#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE +#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK +#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL +#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL +#define STRINGLIB_STR PyUnicode_1BYTE_DATA +#define STRINGLIB_LEN PyUnicode_GET_LENGTH #define STRINGLIB_NEW(STR,LEN) _PyUnicode_FromASCII((const char*)(STR),(LEN)) -#define STRINGLIB_CHECK PyUnicode_Check -#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact - -#define STRINGLIB_TOSTR PyObject_Str -#define STRINGLIB_TOASCII PyObject_ASCII +#define STRINGLIB_CHECK PyUnicode_Check +#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact + +#define STRINGLIB_TOSTR PyObject_Str +#define STRINGLIB_TOASCII PyObject_ASCII diff --git a/contrib/tools/python3/src/Objects/stringlib/codecs.h b/contrib/tools/python3/src/Objects/stringlib/codecs.h index dc2f8d2967..9b2a29ba3b 100644 --- a/contrib/tools/python3/src/Objects/stringlib/codecs.h +++ b/contrib/tools/python3/src/Objects/stringlib/codecs.h @@ -1,825 +1,825 @@ -/* stringlib: codec implementations */ - -#if !STRINGLIB_IS_UNICODE -# error "codecs.h is specific to Unicode" -#endif - +/* stringlib: codec implementations */ + +#if !STRINGLIB_IS_UNICODE +# error "codecs.h is specific to Unicode" +#endif + #include "pycore_byteswap.h" // _Py_bswap32() -/* Mask to quickly check whether a C 'long' contains a - non-ASCII, UTF8-encoded char. */ -#if (SIZEOF_LONG == 8) -# define ASCII_CHAR_MASK 0x8080808080808080UL -#elif (SIZEOF_LONG == 4) -# define ASCII_CHAR_MASK 0x80808080UL -#else -# error C 'long' size should be either 4 or 8! -#endif - -/* 10xxxxxx */ -#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0) - -Py_LOCAL_INLINE(Py_UCS4) -STRINGLIB(utf8_decode)(const char **inptr, const char *end, - STRINGLIB_CHAR *dest, - Py_ssize_t *outpos) -{ - Py_UCS4 ch; - const char *s = *inptr; - const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); - STRINGLIB_CHAR *p = dest + *outpos; - - while (s < end) { - ch = (unsigned char)*s; - - if (ch < 0x80) { - /* Fast path for runs of ASCII characters. Given that common UTF-8 - input will consist of an overwhelming majority of ASCII - characters, we try to optimize for this case by checking - as many characters as a C 'long' can contain. - First, check if we can do an aligned read, as most CPUs have - a penalty for unaligned reads. - */ - if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) { - /* Help register allocation */ - const char *_s = s; - STRINGLIB_CHAR *_p = p; - while (_s < aligned_end) { - /* Read a whole long at a time (either 4 or 8 bytes), - and do a fast unrolled copy if it only contains ASCII - characters. */ +/* Mask to quickly check whether a C 'long' contains a + non-ASCII, UTF8-encoded char. */ +#if (SIZEOF_LONG == 8) +# define ASCII_CHAR_MASK 0x8080808080808080UL +#elif (SIZEOF_LONG == 4) +# define ASCII_CHAR_MASK 0x80808080UL +#else +# error C 'long' size should be either 4 or 8! +#endif + +/* 10xxxxxx */ +#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0) + +Py_LOCAL_INLINE(Py_UCS4) +STRINGLIB(utf8_decode)(const char **inptr, const char *end, + STRINGLIB_CHAR *dest, + Py_ssize_t *outpos) +{ + Py_UCS4 ch; + const char *s = *inptr; + const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); + STRINGLIB_CHAR *p = dest + *outpos; + + while (s < end) { + ch = (unsigned char)*s; + + if (ch < 0x80) { + /* Fast path for runs of ASCII characters. Given that common UTF-8 + input will consist of an overwhelming majority of ASCII + characters, we try to optimize for this case by checking + as many characters as a C 'long' can contain. + First, check if we can do an aligned read, as most CPUs have + a penalty for unaligned reads. + */ + if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) { + /* Help register allocation */ + const char *_s = s; + STRINGLIB_CHAR *_p = p; + while (_s < aligned_end) { + /* Read a whole long at a time (either 4 or 8 bytes), + and do a fast unrolled copy if it only contains ASCII + characters. */ unsigned long value = *(const unsigned long *) _s; - if (value & ASCII_CHAR_MASK) - break; -#if PY_LITTLE_ENDIAN - _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); - _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); - _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); - _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); -# if SIZEOF_LONG == 8 - _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); - _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); - _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); - _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); -# endif -#else -# if SIZEOF_LONG == 8 - _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); - _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); - _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); - _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); - _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); - _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); - _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); - _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); -# else - _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); - _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); - _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); - _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); -# endif -#endif - _s += SIZEOF_LONG; - _p += SIZEOF_LONG; - } - s = _s; - p = _p; - if (s == end) - break; - ch = (unsigned char)*s; - } - if (ch < 0x80) { - s++; - *p++ = ch; - continue; - } - } - - if (ch < 0xE0) { - /* \xC2\x80-\xDF\xBF -- 0080-07FF */ - Py_UCS4 ch2; - if (ch < 0xC2) { - /* invalid sequence - \x80-\xBF -- continuation byte - \xC0-\xC1 -- fake 0000-007F */ - goto InvalidStart; - } - if (end - s < 2) { - /* unexpected end of data: the caller will decide whether - it's an error or not */ - break; - } - ch2 = (unsigned char)s[1]; - if (!IS_CONTINUATION_BYTE(ch2)) - /* invalid continuation byte */ - goto InvalidContinuation1; - ch = (ch << 6) + ch2 - - ((0xC0 << 6) + 0x80); - assert ((ch > 0x007F) && (ch <= 0x07FF)); - s += 2; - if (STRINGLIB_MAX_CHAR <= 0x007F || - (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) - /* Out-of-range */ - goto Return; - *p++ = ch; - continue; - } - - if (ch < 0xF0) { - /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ - Py_UCS4 ch2, ch3; - if (end - s < 3) { - /* unexpected end of data: the caller will decide whether - it's an error or not */ - if (end - s < 2) - break; - ch2 = (unsigned char)s[1]; - if (!IS_CONTINUATION_BYTE(ch2) || - (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) - /* for clarification see comments below */ - goto InvalidContinuation1; - break; - } - ch2 = (unsigned char)s[1]; - ch3 = (unsigned char)s[2]; - if (!IS_CONTINUATION_BYTE(ch2)) { - /* invalid continuation byte */ - goto InvalidContinuation1; - } - if (ch == 0xE0) { - if (ch2 < 0xA0) - /* invalid sequence - \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ - goto InvalidContinuation1; - } else if (ch == 0xED && ch2 >= 0xA0) { - /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF - will result in surrogates in range D800-DFFF. Surrogates are - not valid UTF-8 so they are rejected. + if (value & ASCII_CHAR_MASK) + break; +#if PY_LITTLE_ENDIAN + _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); + _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); + _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); + _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); +# if SIZEOF_LONG == 8 + _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); + _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); + _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); + _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); +# endif +#else +# if SIZEOF_LONG == 8 + _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); + _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); + _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); + _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); + _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); + _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); + _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); + _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); +# else + _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); + _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); + _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); + _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); +# endif +#endif + _s += SIZEOF_LONG; + _p += SIZEOF_LONG; + } + s = _s; + p = _p; + if (s == end) + break; + ch = (unsigned char)*s; + } + if (ch < 0x80) { + s++; + *p++ = ch; + continue; + } + } + + if (ch < 0xE0) { + /* \xC2\x80-\xDF\xBF -- 0080-07FF */ + Py_UCS4 ch2; + if (ch < 0xC2) { + /* invalid sequence + \x80-\xBF -- continuation byte + \xC0-\xC1 -- fake 0000-007F */ + goto InvalidStart; + } + if (end - s < 2) { + /* unexpected end of data: the caller will decide whether + it's an error or not */ + break; + } + ch2 = (unsigned char)s[1]; + if (!IS_CONTINUATION_BYTE(ch2)) + /* invalid continuation byte */ + goto InvalidContinuation1; + ch = (ch << 6) + ch2 - + ((0xC0 << 6) + 0x80); + assert ((ch > 0x007F) && (ch <= 0x07FF)); + s += 2; + if (STRINGLIB_MAX_CHAR <= 0x007F || + (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) + /* Out-of-range */ + goto Return; + *p++ = ch; + continue; + } + + if (ch < 0xF0) { + /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ + Py_UCS4 ch2, ch3; + if (end - s < 3) { + /* unexpected end of data: the caller will decide whether + it's an error or not */ + if (end - s < 2) + break; + ch2 = (unsigned char)s[1]; + if (!IS_CONTINUATION_BYTE(ch2) || + (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) + /* for clarification see comments below */ + goto InvalidContinuation1; + break; + } + ch2 = (unsigned char)s[1]; + ch3 = (unsigned char)s[2]; + if (!IS_CONTINUATION_BYTE(ch2)) { + /* invalid continuation byte */ + goto InvalidContinuation1; + } + if (ch == 0xE0) { + if (ch2 < 0xA0) + /* invalid sequence + \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ + goto InvalidContinuation1; + } else if (ch == 0xED && ch2 >= 0xA0) { + /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF + will result in surrogates in range D800-DFFF. Surrogates are + not valid UTF-8 so they are rejected. See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf - (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ - goto InvalidContinuation1; - } - if (!IS_CONTINUATION_BYTE(ch3)) { - /* invalid continuation byte */ - goto InvalidContinuation2; - } - ch = (ch << 12) + (ch2 << 6) + ch3 - - ((0xE0 << 12) + (0x80 << 6) + 0x80); - assert ((ch > 0x07FF) && (ch <= 0xFFFF)); - s += 3; - if (STRINGLIB_MAX_CHAR <= 0x07FF || - (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) - /* Out-of-range */ - goto Return; - *p++ = ch; - continue; - } - - if (ch < 0xF5) { - /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ - Py_UCS4 ch2, ch3, ch4; - if (end - s < 4) { - /* unexpected end of data: the caller will decide whether - it's an error or not */ - if (end - s < 2) - break; - ch2 = (unsigned char)s[1]; - if (!IS_CONTINUATION_BYTE(ch2) || - (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) - /* for clarification see comments below */ - goto InvalidContinuation1; - if (end - s < 3) - break; - ch3 = (unsigned char)s[2]; - if (!IS_CONTINUATION_BYTE(ch3)) - goto InvalidContinuation2; - break; - } - ch2 = (unsigned char)s[1]; - ch3 = (unsigned char)s[2]; - ch4 = (unsigned char)s[3]; - if (!IS_CONTINUATION_BYTE(ch2)) { - /* invalid continuation byte */ - goto InvalidContinuation1; - } - if (ch == 0xF0) { - if (ch2 < 0x90) - /* invalid sequence - \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ - goto InvalidContinuation1; - } else if (ch == 0xF4 && ch2 >= 0x90) { - /* invalid sequence + (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ + goto InvalidContinuation1; + } + if (!IS_CONTINUATION_BYTE(ch3)) { + /* invalid continuation byte */ + goto InvalidContinuation2; + } + ch = (ch << 12) + (ch2 << 6) + ch3 - + ((0xE0 << 12) + (0x80 << 6) + 0x80); + assert ((ch > 0x07FF) && (ch <= 0xFFFF)); + s += 3; + if (STRINGLIB_MAX_CHAR <= 0x07FF || + (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) + /* Out-of-range */ + goto Return; + *p++ = ch; + continue; + } + + if (ch < 0xF5) { + /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ + Py_UCS4 ch2, ch3, ch4; + if (end - s < 4) { + /* unexpected end of data: the caller will decide whether + it's an error or not */ + if (end - s < 2) + break; + ch2 = (unsigned char)s[1]; + if (!IS_CONTINUATION_BYTE(ch2) || + (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) + /* for clarification see comments below */ + goto InvalidContinuation1; + if (end - s < 3) + break; + ch3 = (unsigned char)s[2]; + if (!IS_CONTINUATION_BYTE(ch3)) + goto InvalidContinuation2; + break; + } + ch2 = (unsigned char)s[1]; + ch3 = (unsigned char)s[2]; + ch4 = (unsigned char)s[3]; + if (!IS_CONTINUATION_BYTE(ch2)) { + /* invalid continuation byte */ + goto InvalidContinuation1; + } + if (ch == 0xF0) { + if (ch2 < 0x90) + /* invalid sequence + \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ + goto InvalidContinuation1; + } else if (ch == 0xF4 && ch2 >= 0x90) { + /* invalid sequence \xF4\x90\x80\x80- -- 110000- overflow */ - goto InvalidContinuation1; - } - if (!IS_CONTINUATION_BYTE(ch3)) { - /* invalid continuation byte */ - goto InvalidContinuation2; - } - if (!IS_CONTINUATION_BYTE(ch4)) { - /* invalid continuation byte */ - goto InvalidContinuation3; - } - ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - - ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); - assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); - s += 4; - if (STRINGLIB_MAX_CHAR <= 0xFFFF || - (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) - /* Out-of-range */ - goto Return; - *p++ = ch; - continue; - } - goto InvalidStart; - } - ch = 0; -Return: - *inptr = s; - *outpos = p - dest; - return ch; -InvalidStart: - ch = 1; - goto Return; -InvalidContinuation1: - ch = 2; - goto Return; -InvalidContinuation2: - ch = 3; - goto Return; -InvalidContinuation3: - ch = 4; - goto Return; -} - -#undef ASCII_CHAR_MASK - - -/* UTF-8 encoder specialized for a Unicode kind to avoid the slow - PyUnicode_READ() macro. Delete some parts of the code depending on the kind: - UCS-1 strings don't need to handle surrogates for example. */ + goto InvalidContinuation1; + } + if (!IS_CONTINUATION_BYTE(ch3)) { + /* invalid continuation byte */ + goto InvalidContinuation2; + } + if (!IS_CONTINUATION_BYTE(ch4)) { + /* invalid continuation byte */ + goto InvalidContinuation3; + } + ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - + ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); + assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); + s += 4; + if (STRINGLIB_MAX_CHAR <= 0xFFFF || + (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) + /* Out-of-range */ + goto Return; + *p++ = ch; + continue; + } + goto InvalidStart; + } + ch = 0; +Return: + *inptr = s; + *outpos = p - dest; + return ch; +InvalidStart: + ch = 1; + goto Return; +InvalidContinuation1: + ch = 2; + goto Return; +InvalidContinuation2: + ch = 3; + goto Return; +InvalidContinuation3: + ch = 4; + goto Return; +} + +#undef ASCII_CHAR_MASK + + +/* UTF-8 encoder specialized for a Unicode kind to avoid the slow + PyUnicode_READ() macro. Delete some parts of the code depending on the kind: + UCS-1 strings don't need to handle surrogates for example. */ Py_LOCAL_INLINE(char *) STRINGLIB(utf8_encoder)(_PyBytesWriter *writer, PyObject *unicode, const STRINGLIB_CHAR *data, - Py_ssize_t size, + Py_ssize_t size, _Py_error_handler error_handler, - const char *errors) -{ - Py_ssize_t i; /* index into data of next input character */ - char *p; /* next free byte in output buffer */ -#if STRINGLIB_SIZEOF_CHAR > 1 - PyObject *error_handler_obj = NULL; - PyObject *exc = NULL; - PyObject *rep = NULL; -#endif -#if STRINGLIB_SIZEOF_CHAR == 1 - const Py_ssize_t max_char_size = 2; -#elif STRINGLIB_SIZEOF_CHAR == 2 - const Py_ssize_t max_char_size = 3; -#else /* STRINGLIB_SIZEOF_CHAR == 4 */ - const Py_ssize_t max_char_size = 4; -#endif - - assert(size >= 0); - if (size > PY_SSIZE_T_MAX / max_char_size) { - /* integer overflow */ + const char *errors) +{ + Py_ssize_t i; /* index into data of next input character */ + char *p; /* next free byte in output buffer */ +#if STRINGLIB_SIZEOF_CHAR > 1 + PyObject *error_handler_obj = NULL; + PyObject *exc = NULL; + PyObject *rep = NULL; +#endif +#if STRINGLIB_SIZEOF_CHAR == 1 + const Py_ssize_t max_char_size = 2; +#elif STRINGLIB_SIZEOF_CHAR == 2 + const Py_ssize_t max_char_size = 3; +#else /* STRINGLIB_SIZEOF_CHAR == 4 */ + const Py_ssize_t max_char_size = 4; +#endif + + assert(size >= 0); + if (size > PY_SSIZE_T_MAX / max_char_size) { + /* integer overflow */ PyErr_NoMemory(); return NULL; - } - + } + _PyBytesWriter_Init(writer); p = _PyBytesWriter_Alloc(writer, size * max_char_size); - if (p == NULL) - return NULL; - - for (i = 0; i < size;) { - Py_UCS4 ch = data[i++]; - - if (ch < 0x80) { - /* Encode ASCII */ - *p++ = (char) ch; - - } - else -#if STRINGLIB_SIZEOF_CHAR > 1 - if (ch < 0x0800) -#endif - { - /* Encode Latin-1 */ - *p++ = (char)(0xc0 | (ch >> 6)); - *p++ = (char)(0x80 | (ch & 0x3f)); - } -#if STRINGLIB_SIZEOF_CHAR > 1 - else if (Py_UNICODE_IS_SURROGATE(ch)) { - Py_ssize_t startpos, endpos, newpos; - Py_ssize_t k; - if (error_handler == _Py_ERROR_UNKNOWN) { + if (p == NULL) + return NULL; + + for (i = 0; i < size;) { + Py_UCS4 ch = data[i++]; + + if (ch < 0x80) { + /* Encode ASCII */ + *p++ = (char) ch; + + } + else +#if STRINGLIB_SIZEOF_CHAR > 1 + if (ch < 0x0800) +#endif + { + /* Encode Latin-1 */ + *p++ = (char)(0xc0 | (ch >> 6)); + *p++ = (char)(0x80 | (ch & 0x3f)); + } +#if STRINGLIB_SIZEOF_CHAR > 1 + else if (Py_UNICODE_IS_SURROGATE(ch)) { + Py_ssize_t startpos, endpos, newpos; + Py_ssize_t k; + if (error_handler == _Py_ERROR_UNKNOWN) { error_handler = _Py_GetErrorHandler(errors); - } - - startpos = i-1; - endpos = startpos+1; - - while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) - endpos++; - - /* Only overallocate the buffer if it's not the last write */ + } + + startpos = i-1; + endpos = startpos+1; + + while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) + endpos++; + + /* Only overallocate the buffer if it's not the last write */ writer->overallocate = (endpos < size); - - switch (error_handler) - { - case _Py_ERROR_REPLACE: - memset(p, '?', endpos - startpos); - p += (endpos - startpos); - /* fall through */ - case _Py_ERROR_IGNORE: - i += (endpos - startpos - 1); - break; - - case _Py_ERROR_SURROGATEPASS: - for (k=startpos; k<endpos; k++) { - ch = data[k]; - *p++ = (char)(0xe0 | (ch >> 12)); - *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *p++ = (char)(0x80 | (ch & 0x3f)); - } - i += (endpos - startpos - 1); - break; - - case _Py_ERROR_BACKSLASHREPLACE: - /* subtract preallocated bytes */ + + switch (error_handler) + { + case _Py_ERROR_REPLACE: + memset(p, '?', endpos - startpos); + p += (endpos - startpos); + /* fall through */ + case _Py_ERROR_IGNORE: + i += (endpos - startpos - 1); + break; + + case _Py_ERROR_SURROGATEPASS: + for (k=startpos; k<endpos; k++) { + ch = data[k]; + *p++ = (char)(0xe0 | (ch >> 12)); + *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *p++ = (char)(0x80 | (ch & 0x3f)); + } + i += (endpos - startpos - 1); + break; + + case _Py_ERROR_BACKSLASHREPLACE: + /* subtract preallocated bytes */ writer->min_size -= max_char_size * (endpos - startpos); p = backslashreplace(writer, p, - unicode, startpos, endpos); - if (p == NULL) - goto error; - i += (endpos - startpos - 1); - break; - - case _Py_ERROR_XMLCHARREFREPLACE: - /* subtract preallocated bytes */ + unicode, startpos, endpos); + if (p == NULL) + goto error; + i += (endpos - startpos - 1); + break; + + case _Py_ERROR_XMLCHARREFREPLACE: + /* subtract preallocated bytes */ writer->min_size -= max_char_size * (endpos - startpos); p = xmlcharrefreplace(writer, p, - unicode, startpos, endpos); - if (p == NULL) - goto error; - i += (endpos - startpos - 1); - break; - - case _Py_ERROR_SURROGATEESCAPE: - for (k=startpos; k<endpos; k++) { - ch = data[k]; - if (!(0xDC80 <= ch && ch <= 0xDCFF)) - break; - *p++ = (char)(ch & 0xff); - } - if (k >= endpos) { - i += (endpos - startpos - 1); - break; - } - startpos = k; - assert(startpos < endpos); - /* fall through */ - default: - rep = unicode_encode_call_errorhandler( - errors, &error_handler_obj, "utf-8", "surrogates not allowed", - unicode, &exc, startpos, endpos, &newpos); - if (!rep) - goto error; - - /* subtract preallocated bytes */ + unicode, startpos, endpos); + if (p == NULL) + goto error; + i += (endpos - startpos - 1); + break; + + case _Py_ERROR_SURROGATEESCAPE: + for (k=startpos; k<endpos; k++) { + ch = data[k]; + if (!(0xDC80 <= ch && ch <= 0xDCFF)) + break; + *p++ = (char)(ch & 0xff); + } + if (k >= endpos) { + i += (endpos - startpos - 1); + break; + } + startpos = k; + assert(startpos < endpos); + /* fall through */ + default: + rep = unicode_encode_call_errorhandler( + errors, &error_handler_obj, "utf-8", "surrogates not allowed", + unicode, &exc, startpos, endpos, &newpos); + if (!rep) + goto error; + + /* subtract preallocated bytes */ writer->min_size -= max_char_size * (newpos - startpos); - - if (PyBytes_Check(rep)) { + + if (PyBytes_Check(rep)) { p = _PyBytesWriter_WriteBytes(writer, p, - PyBytes_AS_STRING(rep), - PyBytes_GET_SIZE(rep)); - } - else { - /* rep is unicode */ - if (PyUnicode_READY(rep) < 0) - goto error; - - if (!PyUnicode_IS_ASCII(rep)) { - raise_encode_exception(&exc, "utf-8", unicode, - startpos, endpos, - "surrogates not allowed"); - goto error; - } - + PyBytes_AS_STRING(rep), + PyBytes_GET_SIZE(rep)); + } + else { + /* rep is unicode */ + if (PyUnicode_READY(rep) < 0) + goto error; + + if (!PyUnicode_IS_ASCII(rep)) { + raise_encode_exception(&exc, "utf-8", unicode, + startpos, endpos, + "surrogates not allowed"); + goto error; + } + p = _PyBytesWriter_WriteBytes(writer, p, - PyUnicode_DATA(rep), - PyUnicode_GET_LENGTH(rep)); - } - - if (p == NULL) - goto error; - Py_CLEAR(rep); - - i = newpos; - } - - /* If overallocation was disabled, ensure that it was the last - write. Otherwise, we missed an optimization */ + PyUnicode_DATA(rep), + PyUnicode_GET_LENGTH(rep)); + } + + if (p == NULL) + goto error; + Py_CLEAR(rep); + + i = newpos; + } + + /* If overallocation was disabled, ensure that it was the last + write. Otherwise, we missed an optimization */ assert(writer->overallocate || i == size); - } - else -#if STRINGLIB_SIZEOF_CHAR > 2 - if (ch < 0x10000) -#endif - { - *p++ = (char)(0xe0 | (ch >> 12)); - *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *p++ = (char)(0x80 | (ch & 0x3f)); - } -#if STRINGLIB_SIZEOF_CHAR > 2 - else /* ch >= 0x10000 */ - { - assert(ch <= MAX_UNICODE); - /* Encode UCS4 Unicode ordinals */ - *p++ = (char)(0xf0 | (ch >> 18)); - *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); - *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *p++ = (char)(0x80 | (ch & 0x3f)); - } -#endif /* STRINGLIB_SIZEOF_CHAR > 2 */ -#endif /* STRINGLIB_SIZEOF_CHAR > 1 */ - } - -#if STRINGLIB_SIZEOF_CHAR > 1 - Py_XDECREF(error_handler_obj); - Py_XDECREF(exc); -#endif + } + else +#if STRINGLIB_SIZEOF_CHAR > 2 + if (ch < 0x10000) +#endif + { + *p++ = (char)(0xe0 | (ch >> 12)); + *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *p++ = (char)(0x80 | (ch & 0x3f)); + } +#if STRINGLIB_SIZEOF_CHAR > 2 + else /* ch >= 0x10000 */ + { + assert(ch <= MAX_UNICODE); + /* Encode UCS4 Unicode ordinals */ + *p++ = (char)(0xf0 | (ch >> 18)); + *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); + *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *p++ = (char)(0x80 | (ch & 0x3f)); + } +#endif /* STRINGLIB_SIZEOF_CHAR > 2 */ +#endif /* STRINGLIB_SIZEOF_CHAR > 1 */ + } + +#if STRINGLIB_SIZEOF_CHAR > 1 + Py_XDECREF(error_handler_obj); + Py_XDECREF(exc); +#endif return p; - -#if STRINGLIB_SIZEOF_CHAR > 1 - error: - Py_XDECREF(rep); - Py_XDECREF(error_handler_obj); - Py_XDECREF(exc); - return NULL; -#endif -} - -/* The pattern for constructing UCS2-repeated masks. */ -#if SIZEOF_LONG == 8 -# define UCS2_REPEAT_MASK 0x0001000100010001ul -#elif SIZEOF_LONG == 4 -# define UCS2_REPEAT_MASK 0x00010001ul -#else -# error C 'long' size should be either 4 or 8! -#endif - -/* The mask for fast checking. */ -#if STRINGLIB_SIZEOF_CHAR == 1 -/* The mask for fast checking of whether a C 'long' contains a - non-ASCII or non-Latin1 UTF16-encoded characters. */ -# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR)) -#else -/* The mask for fast checking of whether a C 'long' may contain - UTF16-encoded surrogate characters. This is an efficient heuristic, - assuming that non-surrogate characters with a code point >= 0x8000 are - rare in most input. -*/ -# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u) -#endif -/* The mask for fast byte-swapping. */ -#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu) -/* Swap bytes. */ -#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \ - (((value) & STRIPPED_MASK) << 8)) - -Py_LOCAL_INLINE(Py_UCS4) -STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e, - STRINGLIB_CHAR *dest, Py_ssize_t *outpos, - int native_ordering) -{ - Py_UCS4 ch; - const unsigned char *aligned_end = - (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG); - const unsigned char *q = *inptr; - STRINGLIB_CHAR *p = dest + *outpos; - /* Offsets from q for retrieving byte pairs in the right order. */ -#if PY_LITTLE_ENDIAN - int ihi = !!native_ordering, ilo = !native_ordering; -#else - int ihi = !native_ordering, ilo = !!native_ordering; -#endif - --e; - - while (q < e) { - Py_UCS4 ch2; - /* First check for possible aligned read of a C 'long'. Unaligned - reads are more expensive, better to defer to another iteration. */ - if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) { - /* Fast path for runs of in-range non-surrogate chars. */ - const unsigned char *_q = q; - while (_q < aligned_end) { + +#if STRINGLIB_SIZEOF_CHAR > 1 + error: + Py_XDECREF(rep); + Py_XDECREF(error_handler_obj); + Py_XDECREF(exc); + return NULL; +#endif +} + +/* The pattern for constructing UCS2-repeated masks. */ +#if SIZEOF_LONG == 8 +# define UCS2_REPEAT_MASK 0x0001000100010001ul +#elif SIZEOF_LONG == 4 +# define UCS2_REPEAT_MASK 0x00010001ul +#else +# error C 'long' size should be either 4 or 8! +#endif + +/* The mask for fast checking. */ +#if STRINGLIB_SIZEOF_CHAR == 1 +/* The mask for fast checking of whether a C 'long' contains a + non-ASCII or non-Latin1 UTF16-encoded characters. */ +# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR)) +#else +/* The mask for fast checking of whether a C 'long' may contain + UTF16-encoded surrogate characters. This is an efficient heuristic, + assuming that non-surrogate characters with a code point >= 0x8000 are + rare in most input. +*/ +# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u) +#endif +/* The mask for fast byte-swapping. */ +#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu) +/* Swap bytes. */ +#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \ + (((value) & STRIPPED_MASK) << 8)) + +Py_LOCAL_INLINE(Py_UCS4) +STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e, + STRINGLIB_CHAR *dest, Py_ssize_t *outpos, + int native_ordering) +{ + Py_UCS4 ch; + const unsigned char *aligned_end = + (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG); + const unsigned char *q = *inptr; + STRINGLIB_CHAR *p = dest + *outpos; + /* Offsets from q for retrieving byte pairs in the right order. */ +#if PY_LITTLE_ENDIAN + int ihi = !!native_ordering, ilo = !native_ordering; +#else + int ihi = !native_ordering, ilo = !!native_ordering; +#endif + --e; + + while (q < e) { + Py_UCS4 ch2; + /* First check for possible aligned read of a C 'long'. Unaligned + reads are more expensive, better to defer to another iteration. */ + if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) { + /* Fast path for runs of in-range non-surrogate chars. */ + const unsigned char *_q = q; + while (_q < aligned_end) { unsigned long block = * (const unsigned long *) _q; - if (native_ordering) { - /* Can use buffer directly */ - if (block & FAST_CHAR_MASK) - break; - } - else { - /* Need to byte-swap */ - if (block & SWAB(FAST_CHAR_MASK)) - break; -#if STRINGLIB_SIZEOF_CHAR == 1 - block >>= 8; -#else - block = SWAB(block); -#endif - } -#if PY_LITTLE_ENDIAN -# if SIZEOF_LONG == 4 - p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); - p[1] = (STRINGLIB_CHAR)(block >> 16); -# elif SIZEOF_LONG == 8 - p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); - p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); - p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); - p[3] = (STRINGLIB_CHAR)(block >> 48); -# endif -#else -# if SIZEOF_LONG == 4 - p[0] = (STRINGLIB_CHAR)(block >> 16); - p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); -# elif SIZEOF_LONG == 8 - p[0] = (STRINGLIB_CHAR)(block >> 48); - p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); - p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); - p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); -# endif -#endif - _q += SIZEOF_LONG; - p += SIZEOF_LONG / 2; - } - q = _q; - if (q >= e) - break; - } - - ch = (q[ihi] << 8) | q[ilo]; - q += 2; - if (!Py_UNICODE_IS_SURROGATE(ch)) { -#if STRINGLIB_SIZEOF_CHAR < 2 - if (ch > STRINGLIB_MAX_CHAR) - /* Out-of-range */ - goto Return; -#endif - *p++ = (STRINGLIB_CHAR)ch; - continue; - } - - /* UTF-16 code pair: */ + if (native_ordering) { + /* Can use buffer directly */ + if (block & FAST_CHAR_MASK) + break; + } + else { + /* Need to byte-swap */ + if (block & SWAB(FAST_CHAR_MASK)) + break; +#if STRINGLIB_SIZEOF_CHAR == 1 + block >>= 8; +#else + block = SWAB(block); +#endif + } +#if PY_LITTLE_ENDIAN +# if SIZEOF_LONG == 4 + p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); + p[1] = (STRINGLIB_CHAR)(block >> 16); +# elif SIZEOF_LONG == 8 + p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); + p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); + p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); + p[3] = (STRINGLIB_CHAR)(block >> 48); +# endif +#else +# if SIZEOF_LONG == 4 + p[0] = (STRINGLIB_CHAR)(block >> 16); + p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); +# elif SIZEOF_LONG == 8 + p[0] = (STRINGLIB_CHAR)(block >> 48); + p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); + p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); + p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); +# endif +#endif + _q += SIZEOF_LONG; + p += SIZEOF_LONG / 2; + } + q = _q; + if (q >= e) + break; + } + + ch = (q[ihi] << 8) | q[ilo]; + q += 2; + if (!Py_UNICODE_IS_SURROGATE(ch)) { +#if STRINGLIB_SIZEOF_CHAR < 2 + if (ch > STRINGLIB_MAX_CHAR) + /* Out-of-range */ + goto Return; +#endif + *p++ = (STRINGLIB_CHAR)ch; + continue; + } + + /* UTF-16 code pair: */ if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) goto IllegalEncoding; - if (q >= e) - goto UnexpectedEnd; - ch2 = (q[ihi] << 8) | q[ilo]; - q += 2; - if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) - goto IllegalSurrogate; - ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); -#if STRINGLIB_SIZEOF_CHAR < 4 - /* Out-of-range */ - goto Return; -#else - *p++ = (STRINGLIB_CHAR)ch; -#endif - } - ch = 0; -Return: - *inptr = q; - *outpos = p - dest; - return ch; -UnexpectedEnd: - ch = 1; - goto Return; -IllegalEncoding: - ch = 2; - goto Return; -IllegalSurrogate: - ch = 3; - goto Return; -} -#undef UCS2_REPEAT_MASK -#undef FAST_CHAR_MASK -#undef STRIPPED_MASK -#undef SWAB - - -#if STRINGLIB_MAX_CHAR >= 0x80 -Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, - Py_ssize_t len, - unsigned short **outptr, - int native_ordering) -{ - unsigned short *out = *outptr; - const STRINGLIB_CHAR *end = in + len; -#if STRINGLIB_SIZEOF_CHAR == 1 - if (native_ordering) { - const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); - while (in < unrolled_end) { - out[0] = in[0]; - out[1] = in[1]; - out[2] = in[2]; - out[3] = in[3]; - in += 4; out += 4; - } - while (in < end) { - *out++ = *in++; - } - } else { -# define SWAB2(CH) ((CH) << 8) /* high byte is zero */ - const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); - while (in < unrolled_end) { - out[0] = SWAB2(in[0]); - out[1] = SWAB2(in[1]); - out[2] = SWAB2(in[2]); - out[3] = SWAB2(in[3]); - in += 4; out += 4; - } - while (in < end) { - Py_UCS4 ch = *in++; - *out++ = SWAB2((Py_UCS2)ch); - } -#undef SWAB2 - } - *outptr = out; - return len; -#else - if (native_ordering) { -#if STRINGLIB_MAX_CHAR < 0x10000 - const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); - while (in < unrolled_end) { - /* check if any character is a surrogate character */ - if (((in[0] ^ 0xd800) & - (in[1] ^ 0xd800) & - (in[2] ^ 0xd800) & - (in[3] ^ 0xd800) & 0xf800) == 0) - break; - out[0] = in[0]; - out[1] = in[1]; - out[2] = in[2]; - out[3] = in[3]; - in += 4; out += 4; - } -#endif - while (in < end) { - Py_UCS4 ch; - ch = *in++; - if (ch < 0xd800) - *out++ = ch; - else if (ch < 0xe000) - /* reject surrogate characters (U+D800-U+DFFF) */ - goto fail; -#if STRINGLIB_MAX_CHAR >= 0x10000 - else if (ch >= 0x10000) { - out[0] = Py_UNICODE_HIGH_SURROGATE(ch); - out[1] = Py_UNICODE_LOW_SURROGATE(ch); - out += 2; - } -#endif - else - *out++ = ch; - } - } else { -#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) -#if STRINGLIB_MAX_CHAR < 0x10000 - const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); - while (in < unrolled_end) { - /* check if any character is a surrogate character */ - if (((in[0] ^ 0xd800) & - (in[1] ^ 0xd800) & - (in[2] ^ 0xd800) & - (in[3] ^ 0xd800) & 0xf800) == 0) - break; - out[0] = SWAB2(in[0]); - out[1] = SWAB2(in[1]); - out[2] = SWAB2(in[2]); - out[3] = SWAB2(in[3]); - in += 4; out += 4; - } -#endif - while (in < end) { - Py_UCS4 ch = *in++; - if (ch < 0xd800) - *out++ = SWAB2((Py_UCS2)ch); - else if (ch < 0xe000) - /* reject surrogate characters (U+D800-U+DFFF) */ - goto fail; -#if STRINGLIB_MAX_CHAR >= 0x10000 - else if (ch >= 0x10000) { - Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); - Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); - out[0] = SWAB2(ch1); - out[1] = SWAB2(ch2); - out += 2; - } -#endif - else - *out++ = SWAB2((Py_UCS2)ch); - } -#undef SWAB2 - } - *outptr = out; - return len; - fail: - *outptr = out; - return len - (end - in + 1); -#endif -} - + if (q >= e) + goto UnexpectedEnd; + ch2 = (q[ihi] << 8) | q[ilo]; + q += 2; + if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) + goto IllegalSurrogate; + ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); +#if STRINGLIB_SIZEOF_CHAR < 4 + /* Out-of-range */ + goto Return; +#else + *p++ = (STRINGLIB_CHAR)ch; +#endif + } + ch = 0; +Return: + *inptr = q; + *outpos = p - dest; + return ch; +UnexpectedEnd: + ch = 1; + goto Return; +IllegalEncoding: + ch = 2; + goto Return; +IllegalSurrogate: + ch = 3; + goto Return; +} +#undef UCS2_REPEAT_MASK +#undef FAST_CHAR_MASK +#undef STRIPPED_MASK +#undef SWAB + + +#if STRINGLIB_MAX_CHAR >= 0x80 +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, + Py_ssize_t len, + unsigned short **outptr, + int native_ordering) +{ + unsigned short *out = *outptr; + const STRINGLIB_CHAR *end = in + len; +#if STRINGLIB_SIZEOF_CHAR == 1 + if (native_ordering) { + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + in += 4; out += 4; + } + while (in < end) { + *out++ = *in++; + } + } else { +# define SWAB2(CH) ((CH) << 8) /* high byte is zero */ + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { + out[0] = SWAB2(in[0]); + out[1] = SWAB2(in[1]); + out[2] = SWAB2(in[2]); + out[3] = SWAB2(in[3]); + in += 4; out += 4; + } + while (in < end) { + Py_UCS4 ch = *in++; + *out++ = SWAB2((Py_UCS2)ch); + } +#undef SWAB2 + } + *outptr = out; + return len; +#else + if (native_ordering) { +#if STRINGLIB_MAX_CHAR < 0x10000 + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { + /* check if any character is a surrogate character */ + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + in += 4; out += 4; + } +#endif + while (in < end) { + Py_UCS4 ch; + ch = *in++; + if (ch < 0xd800) + *out++ = ch; + else if (ch < 0xe000) + /* reject surrogate characters (U+D800-U+DFFF) */ + goto fail; +#if STRINGLIB_MAX_CHAR >= 0x10000 + else if (ch >= 0x10000) { + out[0] = Py_UNICODE_HIGH_SURROGATE(ch); + out[1] = Py_UNICODE_LOW_SURROGATE(ch); + out += 2; + } +#endif + else + *out++ = ch; + } + } else { +#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) +#if STRINGLIB_MAX_CHAR < 0x10000 + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { + /* check if any character is a surrogate character */ + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; + out[0] = SWAB2(in[0]); + out[1] = SWAB2(in[1]); + out[2] = SWAB2(in[2]); + out[3] = SWAB2(in[3]); + in += 4; out += 4; + } +#endif + while (in < end) { + Py_UCS4 ch = *in++; + if (ch < 0xd800) + *out++ = SWAB2((Py_UCS2)ch); + else if (ch < 0xe000) + /* reject surrogate characters (U+D800-U+DFFF) */ + goto fail; +#if STRINGLIB_MAX_CHAR >= 0x10000 + else if (ch >= 0x10000) { + Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); + Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); + out[0] = SWAB2(ch1); + out[1] = SWAB2(ch2); + out += 2; + } +#endif + else + *out++ = SWAB2((Py_UCS2)ch); + } +#undef SWAB2 + } + *outptr = out; + return len; + fail: + *outptr = out; + return len - (end - in + 1); +#endif +} + static inline uint32_t STRINGLIB(SWAB4)(STRINGLIB_CHAR ch) { uint32_t word = ch; -#if STRINGLIB_SIZEOF_CHAR == 1 +#if STRINGLIB_SIZEOF_CHAR == 1 /* high bytes are zero */ return (word << 24); -#elif STRINGLIB_SIZEOF_CHAR == 2 +#elif STRINGLIB_SIZEOF_CHAR == 2 /* high bytes are zero */ return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8); -#else +#else return _Py_bswap32(word); -#endif +#endif } -Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in, - Py_ssize_t len, +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in, + Py_ssize_t len, uint32_t **outptr, - int native_ordering) -{ + int native_ordering) +{ uint32_t *out = *outptr; - const STRINGLIB_CHAR *end = in + len; - if (native_ordering) { - const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); - while (in < unrolled_end) { -#if STRINGLIB_SIZEOF_CHAR > 1 - /* check if any character is a surrogate character */ - if (((in[0] ^ 0xd800) & - (in[1] ^ 0xd800) & - (in[2] ^ 0xd800) & - (in[3] ^ 0xd800) & 0xf800) == 0) - break; -#endif - out[0] = in[0]; - out[1] = in[1]; - out[2] = in[2]; - out[3] = in[3]; - in += 4; out += 4; - } - while (in < end) { - Py_UCS4 ch; - ch = *in++; -#if STRINGLIB_SIZEOF_CHAR > 1 - if (Py_UNICODE_IS_SURROGATE(ch)) { - /* reject surrogate characters (U+D800-U+DFFF) */ - goto fail; - } -#endif - *out++ = ch; - } - } else { - const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); - while (in < unrolled_end) { -#if STRINGLIB_SIZEOF_CHAR > 1 - /* check if any character is a surrogate character */ - if (((in[0] ^ 0xd800) & - (in[1] ^ 0xd800) & - (in[2] ^ 0xd800) & - (in[3] ^ 0xd800) & 0xf800) == 0) - break; -#endif + const STRINGLIB_CHAR *end = in + len; + if (native_ordering) { + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { +#if STRINGLIB_SIZEOF_CHAR > 1 + /* check if any character is a surrogate character */ + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; +#endif + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + in += 4; out += 4; + } + while (in < end) { + Py_UCS4 ch; + ch = *in++; +#if STRINGLIB_SIZEOF_CHAR > 1 + if (Py_UNICODE_IS_SURROGATE(ch)) { + /* reject surrogate characters (U+D800-U+DFFF) */ + goto fail; + } +#endif + *out++ = ch; + } + } else { + const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); + while (in < unrolled_end) { +#if STRINGLIB_SIZEOF_CHAR > 1 + /* check if any character is a surrogate character */ + if (((in[0] ^ 0xd800) & + (in[1] ^ 0xd800) & + (in[2] ^ 0xd800) & + (in[3] ^ 0xd800) & 0xf800) == 0) + break; +#endif out[0] = STRINGLIB(SWAB4)(in[0]); out[1] = STRINGLIB(SWAB4)(in[1]); out[2] = STRINGLIB(SWAB4)(in[2]); out[3] = STRINGLIB(SWAB4)(in[3]); - in += 4; out += 4; - } - while (in < end) { - Py_UCS4 ch = *in++; -#if STRINGLIB_SIZEOF_CHAR > 1 - if (Py_UNICODE_IS_SURROGATE(ch)) { - /* reject surrogate characters (U+D800-U+DFFF) */ - goto fail; - } -#endif + in += 4; out += 4; + } + while (in < end) { + Py_UCS4 ch = *in++; +#if STRINGLIB_SIZEOF_CHAR > 1 + if (Py_UNICODE_IS_SURROGATE(ch)) { + /* reject surrogate characters (U+D800-U+DFFF) */ + goto fail; + } +#endif *out++ = STRINGLIB(SWAB4)(ch); - } - } - *outptr = out; - return len; -#if STRINGLIB_SIZEOF_CHAR > 1 - fail: - *outptr = out; - return len - (end - in + 1); -#endif -} - -#endif + } + } + *outptr = out; + return len; +#if STRINGLIB_SIZEOF_CHAR > 1 + fail: + *outptr = out; + return len - (end - in + 1); +#endif +} + +#endif diff --git a/contrib/tools/python3/src/Objects/stringlib/count.h b/contrib/tools/python3/src/Objects/stringlib/count.h index 794224d015..f48500bf56 100644 --- a/contrib/tools/python3/src/Objects/stringlib/count.h +++ b/contrib/tools/python3/src/Objects/stringlib/count.h @@ -1,27 +1,27 @@ -/* stringlib: count implementation */ - -#ifndef STRINGLIB_FASTSEARCH_H -#error must include "stringlib/fastsearch.h" before including this module -#endif - -Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(count)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, - const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, - Py_ssize_t maxcount) -{ - Py_ssize_t count; - - if (str_len < 0) - return 0; /* start > len(str) */ - if (sub_len == 0) - return (str_len < maxcount) ? str_len + 1 : maxcount; - - count = FASTSEARCH(str, str_len, sub, sub_len, maxcount, FAST_COUNT); - - if (count < 0) - return 0; /* no match */ - - return count; -} - - +/* stringlib: count implementation */ + +#ifndef STRINGLIB_FASTSEARCH_H +#error must include "stringlib/fastsearch.h" before including this module +#endif + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(count)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, + Py_ssize_t maxcount) +{ + Py_ssize_t count; + + if (str_len < 0) + return 0; /* start > len(str) */ + if (sub_len == 0) + return (str_len < maxcount) ? str_len + 1 : maxcount; + + count = FASTSEARCH(str, str_len, sub, sub_len, maxcount, FAST_COUNT); + + if (count < 0) + return 0; /* no match */ + + return count; +} + + diff --git a/contrib/tools/python3/src/Objects/stringlib/ctype.h b/contrib/tools/python3/src/Objects/stringlib/ctype.h index 466624dac7..9b319b07d1 100644 --- a/contrib/tools/python3/src/Objects/stringlib/ctype.h +++ b/contrib/tools/python3/src/Objects/stringlib/ctype.h @@ -1,116 +1,116 @@ -#if STRINGLIB_IS_UNICODE -# error "ctype.h only compatible with byte-wise strings" -#endif - +#if STRINGLIB_IS_UNICODE +# error "ctype.h only compatible with byte-wise strings" +#endif + #include "pycore_bytes_methods.h" - -static PyObject* + +static PyObject* stringlib_isspace(PyObject *self, PyObject *Py_UNUSED(ignored)) -{ - return _Py_bytes_isspace(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -} - -static PyObject* +{ + return _Py_bytes_isspace(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +} + +static PyObject* stringlib_isalpha(PyObject *self, PyObject *Py_UNUSED(ignored)) -{ - return _Py_bytes_isalpha(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -} - -static PyObject* +{ + return _Py_bytes_isalpha(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +} + +static PyObject* stringlib_isalnum(PyObject *self, PyObject *Py_UNUSED(ignored)) -{ - return _Py_bytes_isalnum(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -} - -static PyObject* +{ + return _Py_bytes_isalnum(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +} + +static PyObject* stringlib_isascii(PyObject *self, PyObject *Py_UNUSED(ignored)) -{ - return _Py_bytes_isascii(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -} - -static PyObject* +{ + return _Py_bytes_isascii(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +} + +static PyObject* stringlib_isdigit(PyObject *self, PyObject *Py_UNUSED(ignored)) -{ - return _Py_bytes_isdigit(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -} - -static PyObject* +{ + return _Py_bytes_isdigit(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +} + +static PyObject* stringlib_islower(PyObject *self, PyObject *Py_UNUSED(ignored)) -{ - return _Py_bytes_islower(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -} - -static PyObject* +{ + return _Py_bytes_islower(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +} + +static PyObject* stringlib_isupper(PyObject *self, PyObject *Py_UNUSED(ignored)) -{ - return _Py_bytes_isupper(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -} - -static PyObject* +{ + return _Py_bytes_isupper(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +} + +static PyObject* stringlib_istitle(PyObject *self, PyObject *Py_UNUSED(ignored)) -{ - return _Py_bytes_istitle(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -} - - -/* functions that return a new object partially translated by ctype funcs: */ - -static PyObject* +{ + return _Py_bytes_istitle(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +} + + +/* functions that return a new object partially translated by ctype funcs: */ + +static PyObject* stringlib_lower(PyObject *self, PyObject *Py_UNUSED(ignored)) -{ - PyObject* newobj; - newobj = STRINGLIB_NEW(NULL, STRINGLIB_LEN(self)); - if (!newobj) - return NULL; - _Py_bytes_lower(STRINGLIB_STR(newobj), STRINGLIB_STR(self), - STRINGLIB_LEN(self)); - return newobj; -} - -static PyObject* +{ + PyObject* newobj; + newobj = STRINGLIB_NEW(NULL, STRINGLIB_LEN(self)); + if (!newobj) + return NULL; + _Py_bytes_lower(STRINGLIB_STR(newobj), STRINGLIB_STR(self), + STRINGLIB_LEN(self)); + return newobj; +} + +static PyObject* stringlib_upper(PyObject *self, PyObject *Py_UNUSED(ignored)) -{ - PyObject* newobj; - newobj = STRINGLIB_NEW(NULL, STRINGLIB_LEN(self)); - if (!newobj) - return NULL; - _Py_bytes_upper(STRINGLIB_STR(newobj), STRINGLIB_STR(self), - STRINGLIB_LEN(self)); - return newobj; -} - -static PyObject* +{ + PyObject* newobj; + newobj = STRINGLIB_NEW(NULL, STRINGLIB_LEN(self)); + if (!newobj) + return NULL; + _Py_bytes_upper(STRINGLIB_STR(newobj), STRINGLIB_STR(self), + STRINGLIB_LEN(self)); + return newobj; +} + +static PyObject* stringlib_title(PyObject *self, PyObject *Py_UNUSED(ignored)) -{ - PyObject* newobj; - newobj = STRINGLIB_NEW(NULL, STRINGLIB_LEN(self)); - if (!newobj) - return NULL; - _Py_bytes_title(STRINGLIB_STR(newobj), STRINGLIB_STR(self), - STRINGLIB_LEN(self)); - return newobj; -} - -static PyObject* +{ + PyObject* newobj; + newobj = STRINGLIB_NEW(NULL, STRINGLIB_LEN(self)); + if (!newobj) + return NULL; + _Py_bytes_title(STRINGLIB_STR(newobj), STRINGLIB_STR(self), + STRINGLIB_LEN(self)); + return newobj; +} + +static PyObject* stringlib_capitalize(PyObject *self, PyObject *Py_UNUSED(ignored)) -{ - PyObject* newobj; - newobj = STRINGLIB_NEW(NULL, STRINGLIB_LEN(self)); - if (!newobj) - return NULL; - _Py_bytes_capitalize(STRINGLIB_STR(newobj), STRINGLIB_STR(self), - STRINGLIB_LEN(self)); - return newobj; -} - -static PyObject* +{ + PyObject* newobj; + newobj = STRINGLIB_NEW(NULL, STRINGLIB_LEN(self)); + if (!newobj) + return NULL; + _Py_bytes_capitalize(STRINGLIB_STR(newobj), STRINGLIB_STR(self), + STRINGLIB_LEN(self)); + return newobj; +} + +static PyObject* stringlib_swapcase(PyObject *self, PyObject *Py_UNUSED(ignored)) -{ - PyObject* newobj; - newobj = STRINGLIB_NEW(NULL, STRINGLIB_LEN(self)); - if (!newobj) - return NULL; - _Py_bytes_swapcase(STRINGLIB_STR(newobj), STRINGLIB_STR(self), - STRINGLIB_LEN(self)); - return newobj; -} +{ + PyObject* newobj; + newobj = STRINGLIB_NEW(NULL, STRINGLIB_LEN(self)); + if (!newobj) + return NULL; + _Py_bytes_swapcase(STRINGLIB_STR(newobj), STRINGLIB_STR(self), + STRINGLIB_LEN(self)); + return newobj; +} diff --git a/contrib/tools/python3/src/Objects/stringlib/eq.h b/contrib/tools/python3/src/Objects/stringlib/eq.h index 4efd518157..9c1058b86c 100644 --- a/contrib/tools/python3/src/Objects/stringlib/eq.h +++ b/contrib/tools/python3/src/Objects/stringlib/eq.h @@ -1,25 +1,25 @@ -/* Fast unicode equal function optimized for dictobject.c and setobject.c */ - -/* Return 1 if two unicode objects are equal, 0 if not. - * unicode_eq() is called when the hash of two unicode objects is equal. - */ -Py_LOCAL_INLINE(int) -unicode_eq(PyObject *aa, PyObject *bb) -{ +/* Fast unicode equal function optimized for dictobject.c and setobject.c */ + +/* Return 1 if two unicode objects are equal, 0 if not. + * unicode_eq() is called when the hash of two unicode objects is equal. + */ +Py_LOCAL_INLINE(int) +unicode_eq(PyObject *aa, PyObject *bb) +{ assert(PyUnicode_Check(aa)); assert(PyUnicode_Check(bb)); assert(PyUnicode_IS_READY(aa)); assert(PyUnicode_IS_READY(bb)); - PyUnicodeObject *a = (PyUnicodeObject *)aa; - PyUnicodeObject *b = (PyUnicodeObject *)bb; - - if (PyUnicode_GET_LENGTH(a) != PyUnicode_GET_LENGTH(b)) - return 0; - if (PyUnicode_GET_LENGTH(a) == 0) - return 1; - if (PyUnicode_KIND(a) != PyUnicode_KIND(b)) - return 0; - return memcmp(PyUnicode_1BYTE_DATA(a), PyUnicode_1BYTE_DATA(b), - PyUnicode_GET_LENGTH(a) * PyUnicode_KIND(a)) == 0; -} + PyUnicodeObject *a = (PyUnicodeObject *)aa; + PyUnicodeObject *b = (PyUnicodeObject *)bb; + + if (PyUnicode_GET_LENGTH(a) != PyUnicode_GET_LENGTH(b)) + return 0; + if (PyUnicode_GET_LENGTH(a) == 0) + return 1; + if (PyUnicode_KIND(a) != PyUnicode_KIND(b)) + return 0; + return memcmp(PyUnicode_1BYTE_DATA(a), PyUnicode_1BYTE_DATA(b), + PyUnicode_GET_LENGTH(a) * PyUnicode_KIND(a)) == 0; +} diff --git a/contrib/tools/python3/src/Objects/stringlib/fastsearch.h b/contrib/tools/python3/src/Objects/stringlib/fastsearch.h index 5ed40b3469..56a4467d35 100644 --- a/contrib/tools/python3/src/Objects/stringlib/fastsearch.h +++ b/contrib/tools/python3/src/Objects/stringlib/fastsearch.h @@ -1,283 +1,283 @@ -/* stringlib: fastsearch implementation */ - -#define STRINGLIB_FASTSEARCH_H - -/* fast search/count implementation, based on a mix between boyer- - moore and horspool, with a few more bells and whistles on the top. - for some more background, see: http://effbot.org/zone/stringlib.htm */ - -/* note: fastsearch may access s[n], which isn't a problem when using - Python's ordinary string types, but may cause problems if you're - using this code in other contexts. also, the count mode returns -1 - if there cannot possible be a match in the target string, and 0 if - it has actually checked for matches, but didn't find any. callers - beware! */ - -#define FAST_COUNT 0 -#define FAST_SEARCH 1 -#define FAST_RSEARCH 2 - -#if LONG_BIT >= 128 -#define STRINGLIB_BLOOM_WIDTH 128 -#elif LONG_BIT >= 64 -#define STRINGLIB_BLOOM_WIDTH 64 -#elif LONG_BIT >= 32 -#define STRINGLIB_BLOOM_WIDTH 32 -#else -#error "LONG_BIT is smaller than 32" -#endif - -#define STRINGLIB_BLOOM_ADD(mask, ch) \ - ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) -#define STRINGLIB_BLOOM(mask, ch) \ - ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) - -#if STRINGLIB_SIZEOF_CHAR == 1 -# define MEMCHR_CUT_OFF 15 -#else -# define MEMCHR_CUT_OFF 40 -#endif - -Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) -{ - const STRINGLIB_CHAR *p, *e; - - p = s; - e = s + n; - if (n > MEMCHR_CUT_OFF) { -#if STRINGLIB_SIZEOF_CHAR == 1 - p = memchr(s, ch, n); - if (p != NULL) - return (p - s); - return -1; -#else +/* stringlib: fastsearch implementation */ + +#define STRINGLIB_FASTSEARCH_H + +/* fast search/count implementation, based on a mix between boyer- + moore and horspool, with a few more bells and whistles on the top. + for some more background, see: http://effbot.org/zone/stringlib.htm */ + +/* note: fastsearch may access s[n], which isn't a problem when using + Python's ordinary string types, but may cause problems if you're + using this code in other contexts. also, the count mode returns -1 + if there cannot possible be a match in the target string, and 0 if + it has actually checked for matches, but didn't find any. callers + beware! */ + +#define FAST_COUNT 0 +#define FAST_SEARCH 1 +#define FAST_RSEARCH 2 + +#if LONG_BIT >= 128 +#define STRINGLIB_BLOOM_WIDTH 128 +#elif LONG_BIT >= 64 +#define STRINGLIB_BLOOM_WIDTH 64 +#elif LONG_BIT >= 32 +#define STRINGLIB_BLOOM_WIDTH 32 +#else +#error "LONG_BIT is smaller than 32" +#endif + +#define STRINGLIB_BLOOM_ADD(mask, ch) \ + ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) +#define STRINGLIB_BLOOM(mask, ch) \ + ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) + +#if STRINGLIB_SIZEOF_CHAR == 1 +# define MEMCHR_CUT_OFF 15 +#else +# define MEMCHR_CUT_OFF 40 +#endif + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) +{ + const STRINGLIB_CHAR *p, *e; + + p = s; + e = s + n; + if (n > MEMCHR_CUT_OFF) { +#if STRINGLIB_SIZEOF_CHAR == 1 + p = memchr(s, ch, n); + if (p != NULL) + return (p - s); + return -1; +#else /* use memchr if we can choose a needle without too many likely - false positives */ - const STRINGLIB_CHAR *s1, *e1; - unsigned char needle = ch & 0xff; - /* If looking for a multiple of 256, we'd have too - many false positives looking for the '\0' byte in UCS2 - and UCS4 representations. */ - if (needle != 0) { - do { - void *candidate = memchr(p, needle, - (e - p) * sizeof(STRINGLIB_CHAR)); - if (candidate == NULL) - return -1; - s1 = p; - p = (const STRINGLIB_CHAR *) - _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); - if (*p == ch) - return (p - s); - /* False positive */ - p++; - if (p - s1 > MEMCHR_CUT_OFF) - continue; - if (e - p <= MEMCHR_CUT_OFF) - break; - e1 = p + MEMCHR_CUT_OFF; - while (p != e1) { - if (*p == ch) - return (p - s); - p++; - } - } - while (e - p > MEMCHR_CUT_OFF); - } -#endif - } - while (p < e) { - if (*p == ch) - return (p - s); - p++; - } - return -1; -} - -Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) -{ - const STRINGLIB_CHAR *p; -#ifdef HAVE_MEMRCHR - /* memrchr() is a GNU extension, available since glibc 2.1.91. - it doesn't seem as optimized as memchr(), but is still quite - faster than our hand-written loop below */ - - if (n > MEMCHR_CUT_OFF) { -#if STRINGLIB_SIZEOF_CHAR == 1 - p = memrchr(s, ch, n); - if (p != NULL) - return (p - s); - return -1; -#else + false positives */ + const STRINGLIB_CHAR *s1, *e1; + unsigned char needle = ch & 0xff; + /* If looking for a multiple of 256, we'd have too + many false positives looking for the '\0' byte in UCS2 + and UCS4 representations. */ + if (needle != 0) { + do { + void *candidate = memchr(p, needle, + (e - p) * sizeof(STRINGLIB_CHAR)); + if (candidate == NULL) + return -1; + s1 = p; + p = (const STRINGLIB_CHAR *) + _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); + if (*p == ch) + return (p - s); + /* False positive */ + p++; + if (p - s1 > MEMCHR_CUT_OFF) + continue; + if (e - p <= MEMCHR_CUT_OFF) + break; + e1 = p + MEMCHR_CUT_OFF; + while (p != e1) { + if (*p == ch) + return (p - s); + p++; + } + } + while (e - p > MEMCHR_CUT_OFF); + } +#endif + } + while (p < e) { + if (*p == ch) + return (p - s); + p++; + } + return -1; +} + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) +{ + const STRINGLIB_CHAR *p; +#ifdef HAVE_MEMRCHR + /* memrchr() is a GNU extension, available since glibc 2.1.91. + it doesn't seem as optimized as memchr(), but is still quite + faster than our hand-written loop below */ + + if (n > MEMCHR_CUT_OFF) { +#if STRINGLIB_SIZEOF_CHAR == 1 + p = memrchr(s, ch, n); + if (p != NULL) + return (p - s); + return -1; +#else /* use memrchr if we can choose a needle without too many likely - false positives */ - const STRINGLIB_CHAR *s1; - Py_ssize_t n1; - unsigned char needle = ch & 0xff; - /* If looking for a multiple of 256, we'd have too - many false positives looking for the '\0' byte in UCS2 - and UCS4 representations. */ - if (needle != 0) { - do { - void *candidate = memrchr(s, needle, - n * sizeof(STRINGLIB_CHAR)); - if (candidate == NULL) - return -1; - n1 = n; - p = (const STRINGLIB_CHAR *) - _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); - n = p - s; - if (*p == ch) - return n; - /* False positive */ - if (n1 - n > MEMCHR_CUT_OFF) - continue; - if (n <= MEMCHR_CUT_OFF) - break; - s1 = p - MEMCHR_CUT_OFF; - while (p > s1) { - p--; - if (*p == ch) - return (p - s); - } - n = p - s; - } - while (n > MEMCHR_CUT_OFF); - } -#endif - } -#endif /* HAVE_MEMRCHR */ - p = s + n; - while (p > s) { - p--; - if (*p == ch) - return (p - s); - } - return -1; -} - -#undef MEMCHR_CUT_OFF - -Py_LOCAL_INLINE(Py_ssize_t) -FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, - const STRINGLIB_CHAR* p, Py_ssize_t m, - Py_ssize_t maxcount, int mode) -{ - unsigned long mask; - Py_ssize_t skip, count = 0; - Py_ssize_t i, j, mlast, w; - - w = n - m; - - if (w < 0 || (mode == FAST_COUNT && maxcount == 0)) - return -1; - - /* look for special cases */ - if (m <= 1) { - if (m <= 0) - return -1; - /* use special case for 1-character strings */ - if (mode == FAST_SEARCH) - return STRINGLIB(find_char)(s, n, p[0]); - else if (mode == FAST_RSEARCH) - return STRINGLIB(rfind_char)(s, n, p[0]); - else { /* FAST_COUNT */ - for (i = 0; i < n; i++) - if (s[i] == p[0]) { - count++; - if (count == maxcount) - return maxcount; - } - return count; - } - } - - mlast = m - 1; - skip = mlast - 1; - mask = 0; - - if (mode != FAST_RSEARCH) { - const STRINGLIB_CHAR *ss = s + m - 1; - const STRINGLIB_CHAR *pp = p + m - 1; - - /* create compressed boyer-moore delta 1 table */ - - /* process pattern[:-1] */ - for (i = 0; i < mlast; i++) { - STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == p[mlast]) - skip = mlast - i - 1; - } - /* process pattern[-1] outside the loop */ - STRINGLIB_BLOOM_ADD(mask, p[mlast]); - - for (i = 0; i <= w; i++) { - /* note: using mlast in the skip path slows things down on x86 */ - if (ss[i] == pp[0]) { - /* candidate match */ - for (j = 0; j < mlast; j++) - if (s[i+j] != p[j]) - break; - if (j == mlast) { - /* got a match! */ - if (mode != FAST_COUNT) - return i; - count++; - if (count == maxcount) - return maxcount; - i = i + mlast; - continue; - } - /* miss: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) - i = i + m; - else - i = i + skip; - } else { - /* skip: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) - i = i + m; - } - } - } else { /* FAST_RSEARCH */ - - /* create compressed boyer-moore delta 1 table */ - - /* process pattern[0] outside the loop */ - STRINGLIB_BLOOM_ADD(mask, p[0]); - /* process pattern[:0:-1] */ - for (i = mlast; i > 0; i--) { - STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == p[0]) - skip = i - 1; - } - - for (i = w; i >= 0; i--) { - if (s[i] == p[0]) { - /* candidate match */ - for (j = mlast; j > 0; j--) - if (s[i+j] != p[j]) - break; - if (j == 0) - /* got a match! */ - return i; - /* miss: check if previous character is part of pattern */ - if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) - i = i - m; - else - i = i - skip; - } else { - /* skip: check if previous character is part of pattern */ - if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) - i = i - m; - } - } - } - - if (mode != FAST_COUNT) - return -1; - return count; -} - + false positives */ + const STRINGLIB_CHAR *s1; + Py_ssize_t n1; + unsigned char needle = ch & 0xff; + /* If looking for a multiple of 256, we'd have too + many false positives looking for the '\0' byte in UCS2 + and UCS4 representations. */ + if (needle != 0) { + do { + void *candidate = memrchr(s, needle, + n * sizeof(STRINGLIB_CHAR)); + if (candidate == NULL) + return -1; + n1 = n; + p = (const STRINGLIB_CHAR *) + _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); + n = p - s; + if (*p == ch) + return n; + /* False positive */ + if (n1 - n > MEMCHR_CUT_OFF) + continue; + if (n <= MEMCHR_CUT_OFF) + break; + s1 = p - MEMCHR_CUT_OFF; + while (p > s1) { + p--; + if (*p == ch) + return (p - s); + } + n = p - s; + } + while (n > MEMCHR_CUT_OFF); + } +#endif + } +#endif /* HAVE_MEMRCHR */ + p = s + n; + while (p > s) { + p--; + if (*p == ch) + return (p - s); + } + return -1; +} + +#undef MEMCHR_CUT_OFF + +Py_LOCAL_INLINE(Py_ssize_t) +FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, + const STRINGLIB_CHAR* p, Py_ssize_t m, + Py_ssize_t maxcount, int mode) +{ + unsigned long mask; + Py_ssize_t skip, count = 0; + Py_ssize_t i, j, mlast, w; + + w = n - m; + + if (w < 0 || (mode == FAST_COUNT && maxcount == 0)) + return -1; + + /* look for special cases */ + if (m <= 1) { + if (m <= 0) + return -1; + /* use special case for 1-character strings */ + if (mode == FAST_SEARCH) + return STRINGLIB(find_char)(s, n, p[0]); + else if (mode == FAST_RSEARCH) + return STRINGLIB(rfind_char)(s, n, p[0]); + else { /* FAST_COUNT */ + for (i = 0; i < n; i++) + if (s[i] == p[0]) { + count++; + if (count == maxcount) + return maxcount; + } + return count; + } + } + + mlast = m - 1; + skip = mlast - 1; + mask = 0; + + if (mode != FAST_RSEARCH) { + const STRINGLIB_CHAR *ss = s + m - 1; + const STRINGLIB_CHAR *pp = p + m - 1; + + /* create compressed boyer-moore delta 1 table */ + + /* process pattern[:-1] */ + for (i = 0; i < mlast; i++) { + STRINGLIB_BLOOM_ADD(mask, p[i]); + if (p[i] == p[mlast]) + skip = mlast - i - 1; + } + /* process pattern[-1] outside the loop */ + STRINGLIB_BLOOM_ADD(mask, p[mlast]); + + for (i = 0; i <= w; i++) { + /* note: using mlast in the skip path slows things down on x86 */ + if (ss[i] == pp[0]) { + /* candidate match */ + for (j = 0; j < mlast; j++) + if (s[i+j] != p[j]) + break; + if (j == mlast) { + /* got a match! */ + if (mode != FAST_COUNT) + return i; + count++; + if (count == maxcount) + return maxcount; + i = i + mlast; + continue; + } + /* miss: check if next character is part of pattern */ + if (!STRINGLIB_BLOOM(mask, ss[i+1])) + i = i + m; + else + i = i + skip; + } else { + /* skip: check if next character is part of pattern */ + if (!STRINGLIB_BLOOM(mask, ss[i+1])) + i = i + m; + } + } + } else { /* FAST_RSEARCH */ + + /* create compressed boyer-moore delta 1 table */ + + /* process pattern[0] outside the loop */ + STRINGLIB_BLOOM_ADD(mask, p[0]); + /* process pattern[:0:-1] */ + for (i = mlast; i > 0; i--) { + STRINGLIB_BLOOM_ADD(mask, p[i]); + if (p[i] == p[0]) + skip = i - 1; + } + + for (i = w; i >= 0; i--) { + if (s[i] == p[0]) { + /* candidate match */ + for (j = mlast; j > 0; j--) + if (s[i+j] != p[j]) + break; + if (j == 0) + /* got a match! */ + return i; + /* miss: check if previous character is part of pattern */ + if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) + i = i - m; + else + i = i - skip; + } else { + /* skip: check if previous character is part of pattern */ + if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) + i = i - m; + } + } + } + + if (mode != FAST_COUNT) + return -1; + return count; +} + diff --git a/contrib/tools/python3/src/Objects/stringlib/find.h b/contrib/tools/python3/src/Objects/stringlib/find.h index 9ca4256e40..509b929739 100644 --- a/contrib/tools/python3/src/Objects/stringlib/find.h +++ b/contrib/tools/python3/src/Objects/stringlib/find.h @@ -1,119 +1,119 @@ -/* stringlib: find/index implementation */ - -#ifndef STRINGLIB_FASTSEARCH_H -#error must include "stringlib/fastsearch.h" before including this module -#endif - -Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(find)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, - const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, - Py_ssize_t offset) -{ - Py_ssize_t pos; - - assert(str_len >= 0); - if (sub_len == 0) - return offset; - - pos = FASTSEARCH(str, str_len, sub, sub_len, -1, FAST_SEARCH); - - if (pos >= 0) - pos += offset; - - return pos; -} - -Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(rfind)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, - const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, - Py_ssize_t offset) -{ - Py_ssize_t pos; - - assert(str_len >= 0); - if (sub_len == 0) - return str_len + offset; - - pos = FASTSEARCH(str, str_len, sub, sub_len, -1, FAST_RSEARCH); - - if (pos >= 0) - pos += offset; - - return pos; -} - -Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(find_slice)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, - const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, - Py_ssize_t start, Py_ssize_t end) -{ - return STRINGLIB(find)(str + start, end - start, sub, sub_len, start); -} - -Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(rfind_slice)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, - const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, - Py_ssize_t start, Py_ssize_t end) -{ - return STRINGLIB(rfind)(str + start, end - start, sub, sub_len, start); -} - -#ifdef STRINGLIB_WANT_CONTAINS_OBJ - -Py_LOCAL_INLINE(int) -STRINGLIB(contains_obj)(PyObject* str, PyObject* sub) -{ - return STRINGLIB(find)( - STRINGLIB_STR(str), STRINGLIB_LEN(str), - STRINGLIB_STR(sub), STRINGLIB_LEN(sub), 0 - ) != -1; -} - -#endif /* STRINGLIB_WANT_CONTAINS_OBJ */ - -/* -This function is a helper for the "find" family (find, rfind, index, -rindex) and for count, startswith and endswith, because they all have -the same behaviour for the arguments. - -It does not touch the variables received until it knows everything -is ok. -*/ - -#define FORMAT_BUFFER_SIZE 50 - -Py_LOCAL_INLINE(int) -STRINGLIB(parse_args_finds)(const char * function_name, PyObject *args, - PyObject **subobj, - Py_ssize_t *start, Py_ssize_t *end) -{ - PyObject *tmp_subobj; - Py_ssize_t tmp_start = 0; - Py_ssize_t tmp_end = PY_SSIZE_T_MAX; - PyObject *obj_start=Py_None, *obj_end=Py_None; - char format[FORMAT_BUFFER_SIZE] = "O|OO:"; - size_t len = strlen(format); - - strncpy(format + len, function_name, FORMAT_BUFFER_SIZE - len - 1); - format[FORMAT_BUFFER_SIZE - 1] = '\0'; - - if (!PyArg_ParseTuple(args, format, &tmp_subobj, &obj_start, &obj_end)) - return 0; - - /* To support None in "start" and "end" arguments, meaning - the same as if they were not passed. - */ - if (obj_start != Py_None) - if (!_PyEval_SliceIndex(obj_start, &tmp_start)) - return 0; - if (obj_end != Py_None) - if (!_PyEval_SliceIndex(obj_end, &tmp_end)) - return 0; - - *start = tmp_start; - *end = tmp_end; - *subobj = tmp_subobj; - return 1; -} - -#undef FORMAT_BUFFER_SIZE +/* stringlib: find/index implementation */ + +#ifndef STRINGLIB_FASTSEARCH_H +#error must include "stringlib/fastsearch.h" before including this module +#endif + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(find)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, + Py_ssize_t offset) +{ + Py_ssize_t pos; + + assert(str_len >= 0); + if (sub_len == 0) + return offset; + + pos = FASTSEARCH(str, str_len, sub, sub_len, -1, FAST_SEARCH); + + if (pos >= 0) + pos += offset; + + return pos; +} + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(rfind)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, + Py_ssize_t offset) +{ + Py_ssize_t pos; + + assert(str_len >= 0); + if (sub_len == 0) + return str_len + offset; + + pos = FASTSEARCH(str, str_len, sub, sub_len, -1, FAST_RSEARCH); + + if (pos >= 0) + pos += offset; + + return pos; +} + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(find_slice)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, + Py_ssize_t start, Py_ssize_t end) +{ + return STRINGLIB(find)(str + start, end - start, sub, sub_len, start); +} + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(rfind_slice)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, + Py_ssize_t start, Py_ssize_t end) +{ + return STRINGLIB(rfind)(str + start, end - start, sub, sub_len, start); +} + +#ifdef STRINGLIB_WANT_CONTAINS_OBJ + +Py_LOCAL_INLINE(int) +STRINGLIB(contains_obj)(PyObject* str, PyObject* sub) +{ + return STRINGLIB(find)( + STRINGLIB_STR(str), STRINGLIB_LEN(str), + STRINGLIB_STR(sub), STRINGLIB_LEN(sub), 0 + ) != -1; +} + +#endif /* STRINGLIB_WANT_CONTAINS_OBJ */ + +/* +This function is a helper for the "find" family (find, rfind, index, +rindex) and for count, startswith and endswith, because they all have +the same behaviour for the arguments. + +It does not touch the variables received until it knows everything +is ok. +*/ + +#define FORMAT_BUFFER_SIZE 50 + +Py_LOCAL_INLINE(int) +STRINGLIB(parse_args_finds)(const char * function_name, PyObject *args, + PyObject **subobj, + Py_ssize_t *start, Py_ssize_t *end) +{ + PyObject *tmp_subobj; + Py_ssize_t tmp_start = 0; + Py_ssize_t tmp_end = PY_SSIZE_T_MAX; + PyObject *obj_start=Py_None, *obj_end=Py_None; + char format[FORMAT_BUFFER_SIZE] = "O|OO:"; + size_t len = strlen(format); + + strncpy(format + len, function_name, FORMAT_BUFFER_SIZE - len - 1); + format[FORMAT_BUFFER_SIZE - 1] = '\0'; + + if (!PyArg_ParseTuple(args, format, &tmp_subobj, &obj_start, &obj_end)) + return 0; + + /* To support None in "start" and "end" arguments, meaning + the same as if they were not passed. + */ + if (obj_start != Py_None) + if (!_PyEval_SliceIndex(obj_start, &tmp_start)) + return 0; + if (obj_end != Py_None) + if (!_PyEval_SliceIndex(obj_end, &tmp_end)) + return 0; + + *start = tmp_start; + *end = tmp_end; + *subobj = tmp_subobj; + return 1; +} + +#undef FORMAT_BUFFER_SIZE diff --git a/contrib/tools/python3/src/Objects/stringlib/find_max_char.h b/contrib/tools/python3/src/Objects/stringlib/find_max_char.h index 608bc37a43..f4e0a7761d 100644 --- a/contrib/tools/python3/src/Objects/stringlib/find_max_char.h +++ b/contrib/tools/python3/src/Objects/stringlib/find_max_char.h @@ -1,134 +1,134 @@ -/* Finding the optimal width of unicode characters in a buffer */ - -#if !STRINGLIB_IS_UNICODE -# error "find_max_char.h is specific to Unicode" -#endif - -/* Mask to quickly check whether a C 'long' contains a - non-ASCII, UTF8-encoded char. */ -#if (SIZEOF_LONG == 8) -# define UCS1_ASCII_CHAR_MASK 0x8080808080808080UL -#elif (SIZEOF_LONG == 4) -# define UCS1_ASCII_CHAR_MASK 0x80808080UL -#else -# error C 'long' size should be either 4 or 8! -#endif - -#if STRINGLIB_SIZEOF_CHAR == 1 - -Py_LOCAL_INLINE(Py_UCS4) -STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) -{ - const unsigned char *p = (const unsigned char *) begin; - const unsigned char *aligned_end = - (const unsigned char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); - - while (p < end) { - if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { - /* Help register allocation */ - const unsigned char *_p = p; - while (_p < aligned_end) { +/* Finding the optimal width of unicode characters in a buffer */ + +#if !STRINGLIB_IS_UNICODE +# error "find_max_char.h is specific to Unicode" +#endif + +/* Mask to quickly check whether a C 'long' contains a + non-ASCII, UTF8-encoded char. */ +#if (SIZEOF_LONG == 8) +# define UCS1_ASCII_CHAR_MASK 0x8080808080808080UL +#elif (SIZEOF_LONG == 4) +# define UCS1_ASCII_CHAR_MASK 0x80808080UL +#else +# error C 'long' size should be either 4 or 8! +#endif + +#if STRINGLIB_SIZEOF_CHAR == 1 + +Py_LOCAL_INLINE(Py_UCS4) +STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) +{ + const unsigned char *p = (const unsigned char *) begin; + const unsigned char *aligned_end = + (const unsigned char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); + + while (p < end) { + if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { + /* Help register allocation */ + const unsigned char *_p = p; + while (_p < aligned_end) { unsigned long value = *(const unsigned long *) _p; - if (value & UCS1_ASCII_CHAR_MASK) - return 255; - _p += SIZEOF_LONG; - } - p = _p; - if (p == end) - break; - } - if (*p++ & 0x80) - return 255; - } - return 127; -} - -#undef ASCII_CHAR_MASK - -#else /* STRINGLIB_SIZEOF_CHAR == 1 */ - -#define MASK_ASCII 0xFFFFFF80 -#define MASK_UCS1 0xFFFFFF00 -#define MASK_UCS2 0xFFFF0000 - -#define MAX_CHAR_ASCII 0x7f -#define MAX_CHAR_UCS1 0xff -#define MAX_CHAR_UCS2 0xffff -#define MAX_CHAR_UCS4 0x10ffff - -Py_LOCAL_INLINE(Py_UCS4) -STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) -{ -#if STRINGLIB_SIZEOF_CHAR == 2 - const Py_UCS4 mask_limit = MASK_UCS1; - const Py_UCS4 max_char_limit = MAX_CHAR_UCS2; -#elif STRINGLIB_SIZEOF_CHAR == 4 - const Py_UCS4 mask_limit = MASK_UCS2; - const Py_UCS4 max_char_limit = MAX_CHAR_UCS4; -#else -#error Invalid STRINGLIB_SIZEOF_CHAR (must be 1, 2 or 4) -#endif - Py_UCS4 mask; - Py_ssize_t n = end - begin; - const STRINGLIB_CHAR *p = begin; - const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 4); - Py_UCS4 max_char; - - max_char = MAX_CHAR_ASCII; - mask = MASK_ASCII; - while (p < unrolled_end) { - STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3]; - if (bits & mask) { - if (mask == mask_limit) { - /* Limit reached */ - return max_char_limit; - } - if (mask == MASK_ASCII) { - max_char = MAX_CHAR_UCS1; - mask = MASK_UCS1; - } - else { - /* mask can't be MASK_UCS2 because of mask_limit above */ - assert(mask == MASK_UCS1); - max_char = MAX_CHAR_UCS2; - mask = MASK_UCS2; - } - /* We check the new mask on the same chars in the next iteration */ - continue; - } - p += 4; - } - while (p < end) { - if (p[0] & mask) { - if (mask == mask_limit) { - /* Limit reached */ - return max_char_limit; - } - if (mask == MASK_ASCII) { - max_char = MAX_CHAR_UCS1; - mask = MASK_UCS1; - } - else { - /* mask can't be MASK_UCS2 because of mask_limit above */ - assert(mask == MASK_UCS1); - max_char = MAX_CHAR_UCS2; - mask = MASK_UCS2; - } - /* We check the new mask on the same chars in the next iteration */ - continue; - } - p++; - } - return max_char; -} - -#undef MASK_ASCII -#undef MASK_UCS1 -#undef MASK_UCS2 -#undef MAX_CHAR_ASCII -#undef MAX_CHAR_UCS1 -#undef MAX_CHAR_UCS2 -#undef MAX_CHAR_UCS4 - -#endif /* STRINGLIB_SIZEOF_CHAR == 1 */ - + if (value & UCS1_ASCII_CHAR_MASK) + return 255; + _p += SIZEOF_LONG; + } + p = _p; + if (p == end) + break; + } + if (*p++ & 0x80) + return 255; + } + return 127; +} + +#undef ASCII_CHAR_MASK + +#else /* STRINGLIB_SIZEOF_CHAR == 1 */ + +#define MASK_ASCII 0xFFFFFF80 +#define MASK_UCS1 0xFFFFFF00 +#define MASK_UCS2 0xFFFF0000 + +#define MAX_CHAR_ASCII 0x7f +#define MAX_CHAR_UCS1 0xff +#define MAX_CHAR_UCS2 0xffff +#define MAX_CHAR_UCS4 0x10ffff + +Py_LOCAL_INLINE(Py_UCS4) +STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) +{ +#if STRINGLIB_SIZEOF_CHAR == 2 + const Py_UCS4 mask_limit = MASK_UCS1; + const Py_UCS4 max_char_limit = MAX_CHAR_UCS2; +#elif STRINGLIB_SIZEOF_CHAR == 4 + const Py_UCS4 mask_limit = MASK_UCS2; + const Py_UCS4 max_char_limit = MAX_CHAR_UCS4; +#else +#error Invalid STRINGLIB_SIZEOF_CHAR (must be 1, 2 or 4) +#endif + Py_UCS4 mask; + Py_ssize_t n = end - begin; + const STRINGLIB_CHAR *p = begin; + const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 4); + Py_UCS4 max_char; + + max_char = MAX_CHAR_ASCII; + mask = MASK_ASCII; + while (p < unrolled_end) { + STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3]; + if (bits & mask) { + if (mask == mask_limit) { + /* Limit reached */ + return max_char_limit; + } + if (mask == MASK_ASCII) { + max_char = MAX_CHAR_UCS1; + mask = MASK_UCS1; + } + else { + /* mask can't be MASK_UCS2 because of mask_limit above */ + assert(mask == MASK_UCS1); + max_char = MAX_CHAR_UCS2; + mask = MASK_UCS2; + } + /* We check the new mask on the same chars in the next iteration */ + continue; + } + p += 4; + } + while (p < end) { + if (p[0] & mask) { + if (mask == mask_limit) { + /* Limit reached */ + return max_char_limit; + } + if (mask == MASK_ASCII) { + max_char = MAX_CHAR_UCS1; + mask = MASK_UCS1; + } + else { + /* mask can't be MASK_UCS2 because of mask_limit above */ + assert(mask == MASK_UCS1); + max_char = MAX_CHAR_UCS2; + mask = MASK_UCS2; + } + /* We check the new mask on the same chars in the next iteration */ + continue; + } + p++; + } + return max_char; +} + +#undef MASK_ASCII +#undef MASK_UCS1 +#undef MASK_UCS2 +#undef MAX_CHAR_ASCII +#undef MAX_CHAR_UCS1 +#undef MAX_CHAR_UCS2 +#undef MAX_CHAR_UCS4 + +#endif /* STRINGLIB_SIZEOF_CHAR == 1 */ + diff --git a/contrib/tools/python3/src/Objects/stringlib/join.h b/contrib/tools/python3/src/Objects/stringlib/join.h index acb39b497f..53bcbdea7a 100644 --- a/contrib/tools/python3/src/Objects/stringlib/join.h +++ b/contrib/tools/python3/src/Objects/stringlib/join.h @@ -1,73 +1,73 @@ -/* stringlib: bytes joining implementation */ - -#if STRINGLIB_IS_UNICODE -#error join.h only compatible with byte-wise strings -#endif - -Py_LOCAL_INLINE(PyObject *) -STRINGLIB(bytes_join)(PyObject *sep, PyObject *iterable) -{ +/* stringlib: bytes joining implementation */ + +#if STRINGLIB_IS_UNICODE +#error join.h only compatible with byte-wise strings +#endif + +Py_LOCAL_INLINE(PyObject *) +STRINGLIB(bytes_join)(PyObject *sep, PyObject *iterable) +{ const char *sepstr = STRINGLIB_STR(sep); Py_ssize_t seplen = STRINGLIB_LEN(sep); - PyObject *res = NULL; - char *p; - Py_ssize_t seqlen = 0; - Py_ssize_t sz = 0; - Py_ssize_t i, nbufs; - PyObject *seq, *item; - Py_buffer *buffers = NULL; -#define NB_STATIC_BUFFERS 10 - Py_buffer static_buffers[NB_STATIC_BUFFERS]; + PyObject *res = NULL; + char *p; + Py_ssize_t seqlen = 0; + Py_ssize_t sz = 0; + Py_ssize_t i, nbufs; + PyObject *seq, *item; + Py_buffer *buffers = NULL; +#define NB_STATIC_BUFFERS 10 + Py_buffer static_buffers[NB_STATIC_BUFFERS]; #define GIL_THRESHOLD 1048576 int drop_gil = 1; PyThreadState *save = NULL; - - seq = PySequence_Fast(iterable, "can only join an iterable"); - if (seq == NULL) { - return NULL; - } - - seqlen = PySequence_Fast_GET_SIZE(seq); - if (seqlen == 0) { - Py_DECREF(seq); - return STRINGLIB_NEW(NULL, 0); - } -#ifndef STRINGLIB_MUTABLE - if (seqlen == 1) { - item = PySequence_Fast_GET_ITEM(seq, 0); - if (STRINGLIB_CHECK_EXACT(item)) { - Py_INCREF(item); - Py_DECREF(seq); - return item; - } - } -#endif - if (seqlen > NB_STATIC_BUFFERS) { - buffers = PyMem_NEW(Py_buffer, seqlen); - if (buffers == NULL) { - Py_DECREF(seq); - PyErr_NoMemory(); - return NULL; - } - } - else { - buffers = static_buffers; - } - - /* Here is the general case. Do a pre-pass to figure out the total - * amount of space we'll need (sz), and see whether all arguments are - * bytes-like. - */ - for (i = 0, nbufs = 0; i < seqlen; i++) { - Py_ssize_t itemlen; - item = PySequence_Fast_GET_ITEM(seq, i); - if (PyBytes_CheckExact(item)) { - /* Fast path. */ - Py_INCREF(item); - buffers[i].obj = item; - buffers[i].buf = PyBytes_AS_STRING(item); - buffers[i].len = PyBytes_GET_SIZE(item); - } + + seq = PySequence_Fast(iterable, "can only join an iterable"); + if (seq == NULL) { + return NULL; + } + + seqlen = PySequence_Fast_GET_SIZE(seq); + if (seqlen == 0) { + Py_DECREF(seq); + return STRINGLIB_NEW(NULL, 0); + } +#ifndef STRINGLIB_MUTABLE + if (seqlen == 1) { + item = PySequence_Fast_GET_ITEM(seq, 0); + if (STRINGLIB_CHECK_EXACT(item)) { + Py_INCREF(item); + Py_DECREF(seq); + return item; + } + } +#endif + if (seqlen > NB_STATIC_BUFFERS) { + buffers = PyMem_NEW(Py_buffer, seqlen); + if (buffers == NULL) { + Py_DECREF(seq); + PyErr_NoMemory(); + return NULL; + } + } + else { + buffers = static_buffers; + } + + /* Here is the general case. Do a pre-pass to figure out the total + * amount of space we'll need (sz), and see whether all arguments are + * bytes-like. + */ + for (i = 0, nbufs = 0; i < seqlen; i++) { + Py_ssize_t itemlen; + item = PySequence_Fast_GET_ITEM(seq, i); + if (PyBytes_CheckExact(item)) { + /* Fast path. */ + Py_INCREF(item); + buffers[i].obj = item; + buffers[i].buf = PyBytes_AS_STRING(item); + buffers[i].len = PyBytes_GET_SIZE(item); + } else { if (PyObject_GetBuffer(item, &buffers[i], PyBUF_SIMPLE) != 0) { PyErr_Format(PyExc_TypeError, @@ -83,52 +83,52 @@ STRINGLIB(bytes_join)(PyObject *sep, PyObject *iterable) * changing the behaviour of that data race. */ drop_gil = 0; - } - nbufs = i + 1; /* for error cleanup */ - itemlen = buffers[i].len; - if (itemlen > PY_SSIZE_T_MAX - sz) { - PyErr_SetString(PyExc_OverflowError, - "join() result is too long"); - goto error; - } - sz += itemlen; - if (i != 0) { - if (seplen > PY_SSIZE_T_MAX - sz) { - PyErr_SetString(PyExc_OverflowError, - "join() result is too long"); - goto error; - } - sz += seplen; - } - if (seqlen != PySequence_Fast_GET_SIZE(seq)) { - PyErr_SetString(PyExc_RuntimeError, - "sequence changed size during iteration"); - goto error; - } - } - - /* Allocate result space. */ - res = STRINGLIB_NEW(NULL, sz); - if (res == NULL) - goto error; - - /* Catenate everything. */ - p = STRINGLIB_STR(res); + } + nbufs = i + 1; /* for error cleanup */ + itemlen = buffers[i].len; + if (itemlen > PY_SSIZE_T_MAX - sz) { + PyErr_SetString(PyExc_OverflowError, + "join() result is too long"); + goto error; + } + sz += itemlen; + if (i != 0) { + if (seplen > PY_SSIZE_T_MAX - sz) { + PyErr_SetString(PyExc_OverflowError, + "join() result is too long"); + goto error; + } + sz += seplen; + } + if (seqlen != PySequence_Fast_GET_SIZE(seq)) { + PyErr_SetString(PyExc_RuntimeError, + "sequence changed size during iteration"); + goto error; + } + } + + /* Allocate result space. */ + res = STRINGLIB_NEW(NULL, sz); + if (res == NULL) + goto error; + + /* Catenate everything. */ + p = STRINGLIB_STR(res); if (sz < GIL_THRESHOLD) { drop_gil = 0; /* Benefits are likely outweighed by the overheads */ } if (drop_gil) { save = PyEval_SaveThread(); } - if (!seplen) { - /* fast path */ - for (i = 0; i < nbufs; i++) { - Py_ssize_t n = buffers[i].len; - char *q = buffers[i].buf; - memcpy(p, q, n); - p += n; - } - } + if (!seplen) { + /* fast path */ + for (i = 0; i < nbufs; i++) { + Py_ssize_t n = buffers[i].len; + char *q = buffers[i].buf; + memcpy(p, q, n); + p += n; + } + } else { for (i = 0; i < nbufs; i++) { Py_ssize_t n; @@ -141,23 +141,23 @@ STRINGLIB(bytes_join)(PyObject *sep, PyObject *iterable) q = buffers[i].buf; memcpy(p, q, n); p += n; - } - } + } + } if (drop_gil) { PyEval_RestoreThread(save); } - goto done; - -error: - res = NULL; -done: - Py_DECREF(seq); - for (i = 0; i < nbufs; i++) - PyBuffer_Release(&buffers[i]); - if (buffers != static_buffers) - PyMem_FREE(buffers); - return res; -} - -#undef NB_STATIC_BUFFERS + goto done; + +error: + res = NULL; +done: + Py_DECREF(seq); + for (i = 0; i < nbufs; i++) + PyBuffer_Release(&buffers[i]); + if (buffers != static_buffers) + PyMem_FREE(buffers); + return res; +} + +#undef NB_STATIC_BUFFERS #undef GIL_THRESHOLD diff --git a/contrib/tools/python3/src/Objects/stringlib/localeutil.h b/contrib/tools/python3/src/Objects/stringlib/localeutil.h index de48a62f1f..bd16e0a172 100644 --- a/contrib/tools/python3/src/Objects/stringlib/localeutil.h +++ b/contrib/tools/python3/src/Objects/stringlib/localeutil.h @@ -1,82 +1,82 @@ -/* _PyUnicode_InsertThousandsGrouping() helper functions */ - -typedef struct { - const char *grouping; - char previous; - Py_ssize_t i; /* Where we're currently pointing in grouping. */ -} GroupGenerator; - - -static void -GroupGenerator_init(GroupGenerator *self, const char *grouping) -{ - self->grouping = grouping; - self->i = 0; - self->previous = 0; -} - - -/* Returns the next grouping, or 0 to signify end. */ -static Py_ssize_t -GroupGenerator_next(GroupGenerator *self) -{ - /* Note that we don't really do much error checking here. If a - grouping string contains just CHAR_MAX, for example, then just - terminate the generator. That shouldn't happen, but at least we - fail gracefully. */ - switch (self->grouping[self->i]) { - case 0: - return self->previous; - case CHAR_MAX: - /* Stop the generator. */ - return 0; - default: { - char ch = self->grouping[self->i]; - self->previous = ch; - self->i++; - return (Py_ssize_t)ch; - } - } -} - - -/* Fill in some digits, leading zeros, and thousands separator. All - are optional, depending on when we're called. */ -static void -InsertThousandsGrouping_fill(_PyUnicodeWriter *writer, Py_ssize_t *buffer_pos, - PyObject *digits, Py_ssize_t *digits_pos, - Py_ssize_t n_chars, Py_ssize_t n_zeros, - PyObject *thousands_sep, Py_ssize_t thousands_sep_len, - Py_UCS4 *maxchar) -{ - if (!writer) { - /* if maxchar > 127, maxchar is already set */ - if (*maxchar == 127 && thousands_sep) { - Py_UCS4 maxchar2 = PyUnicode_MAX_CHAR_VALUE(thousands_sep); - *maxchar = Py_MAX(*maxchar, maxchar2); - } - return; - } - - if (thousands_sep) { - *buffer_pos -= thousands_sep_len; - - /* Copy the thousands_sep chars into the buffer. */ - _PyUnicode_FastCopyCharacters(writer->buffer, *buffer_pos, - thousands_sep, 0, - thousands_sep_len); - } - - *buffer_pos -= n_chars; - *digits_pos -= n_chars; - _PyUnicode_FastCopyCharacters(writer->buffer, *buffer_pos, - digits, *digits_pos, - n_chars); - - if (n_zeros) { - *buffer_pos -= n_zeros; - enum PyUnicode_Kind kind = PyUnicode_KIND(writer->buffer); - void *data = PyUnicode_DATA(writer->buffer); +/* _PyUnicode_InsertThousandsGrouping() helper functions */ + +typedef struct { + const char *grouping; + char previous; + Py_ssize_t i; /* Where we're currently pointing in grouping. */ +} GroupGenerator; + + +static void +GroupGenerator_init(GroupGenerator *self, const char *grouping) +{ + self->grouping = grouping; + self->i = 0; + self->previous = 0; +} + + +/* Returns the next grouping, or 0 to signify end. */ +static Py_ssize_t +GroupGenerator_next(GroupGenerator *self) +{ + /* Note that we don't really do much error checking here. If a + grouping string contains just CHAR_MAX, for example, then just + terminate the generator. That shouldn't happen, but at least we + fail gracefully. */ + switch (self->grouping[self->i]) { + case 0: + return self->previous; + case CHAR_MAX: + /* Stop the generator. */ + return 0; + default: { + char ch = self->grouping[self->i]; + self->previous = ch; + self->i++; + return (Py_ssize_t)ch; + } + } +} + + +/* Fill in some digits, leading zeros, and thousands separator. All + are optional, depending on when we're called. */ +static void +InsertThousandsGrouping_fill(_PyUnicodeWriter *writer, Py_ssize_t *buffer_pos, + PyObject *digits, Py_ssize_t *digits_pos, + Py_ssize_t n_chars, Py_ssize_t n_zeros, + PyObject *thousands_sep, Py_ssize_t thousands_sep_len, + Py_UCS4 *maxchar) +{ + if (!writer) { + /* if maxchar > 127, maxchar is already set */ + if (*maxchar == 127 && thousands_sep) { + Py_UCS4 maxchar2 = PyUnicode_MAX_CHAR_VALUE(thousands_sep); + *maxchar = Py_MAX(*maxchar, maxchar2); + } + return; + } + + if (thousands_sep) { + *buffer_pos -= thousands_sep_len; + + /* Copy the thousands_sep chars into the buffer. */ + _PyUnicode_FastCopyCharacters(writer->buffer, *buffer_pos, + thousands_sep, 0, + thousands_sep_len); + } + + *buffer_pos -= n_chars; + *digits_pos -= n_chars; + _PyUnicode_FastCopyCharacters(writer->buffer, *buffer_pos, + digits, *digits_pos, + n_chars); + + if (n_zeros) { + *buffer_pos -= n_zeros; + enum PyUnicode_Kind kind = PyUnicode_KIND(writer->buffer); + void *data = PyUnicode_DATA(writer->buffer); unicode_fill(kind, data, '0', *buffer_pos, n_zeros); - } -} + } +} diff --git a/contrib/tools/python3/src/Objects/stringlib/partition.h b/contrib/tools/python3/src/Objects/stringlib/partition.h index d47ac35217..ed32a6f2b3 100644 --- a/contrib/tools/python3/src/Objects/stringlib/partition.h +++ b/contrib/tools/python3/src/Objects/stringlib/partition.h @@ -1,116 +1,116 @@ -/* stringlib: partition implementation */ - -#ifndef STRINGLIB_FASTSEARCH_H -#error must include "stringlib/fastsearch.h" before including this module -#endif - -Py_LOCAL_INLINE(PyObject*) -STRINGLIB(partition)(PyObject* str_obj, - const STRINGLIB_CHAR* str, Py_ssize_t str_len, - PyObject* sep_obj, - const STRINGLIB_CHAR* sep, Py_ssize_t sep_len) -{ - PyObject* out; - Py_ssize_t pos; - - if (sep_len == 0) { - PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; - } - - out = PyTuple_New(3); - if (!out) - return NULL; - - pos = FASTSEARCH(str, str_len, sep, sep_len, -1, FAST_SEARCH); - - if (pos < 0) { -#if STRINGLIB_MUTABLE - PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, str_len)); - PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0)); - PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(NULL, 0)); - - if (PyErr_Occurred()) { - Py_DECREF(out); - return NULL; - } -#else - Py_INCREF(str_obj); - PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj); - Py_INCREF(STRINGLIB_EMPTY); - PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY); - Py_INCREF(STRINGLIB_EMPTY); - PyTuple_SET_ITEM(out, 2, (PyObject*) STRINGLIB_EMPTY); -#endif - return out; - } - - PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, pos)); - Py_INCREF(sep_obj); - PyTuple_SET_ITEM(out, 1, sep_obj); - pos += sep_len; - PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str + pos, str_len - pos)); - - if (PyErr_Occurred()) { - Py_DECREF(out); - return NULL; - } - - return out; -} - -Py_LOCAL_INLINE(PyObject*) -STRINGLIB(rpartition)(PyObject* str_obj, - const STRINGLIB_CHAR* str, Py_ssize_t str_len, - PyObject* sep_obj, - const STRINGLIB_CHAR* sep, Py_ssize_t sep_len) -{ - PyObject* out; - Py_ssize_t pos; - - if (sep_len == 0) { - PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; - } - - out = PyTuple_New(3); - if (!out) - return NULL; - - pos = FASTSEARCH(str, str_len, sep, sep_len, -1, FAST_RSEARCH); - - if (pos < 0) { -#if STRINGLIB_MUTABLE - PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(NULL, 0)); - PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0)); - PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str, str_len)); - - if (PyErr_Occurred()) { - Py_DECREF(out); - return NULL; - } -#else - Py_INCREF(STRINGLIB_EMPTY); - PyTuple_SET_ITEM(out, 0, (PyObject*) STRINGLIB_EMPTY); - Py_INCREF(STRINGLIB_EMPTY); - PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY); - Py_INCREF(str_obj); - PyTuple_SET_ITEM(out, 2, (PyObject*) str_obj); -#endif - return out; - } - - PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, pos)); - Py_INCREF(sep_obj); - PyTuple_SET_ITEM(out, 1, sep_obj); - pos += sep_len; - PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str + pos, str_len - pos)); - - if (PyErr_Occurred()) { - Py_DECREF(out); - return NULL; - } - - return out; -} - +/* stringlib: partition implementation */ + +#ifndef STRINGLIB_FASTSEARCH_H +#error must include "stringlib/fastsearch.h" before including this module +#endif + +Py_LOCAL_INLINE(PyObject*) +STRINGLIB(partition)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + PyObject* sep_obj, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len) +{ + PyObject* out; + Py_ssize_t pos; + + if (sep_len == 0) { + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + + out = PyTuple_New(3); + if (!out) + return NULL; + + pos = FASTSEARCH(str, str_len, sep, sep_len, -1, FAST_SEARCH); + + if (pos < 0) { +#if STRINGLIB_MUTABLE + PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, str_len)); + PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0)); + PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(NULL, 0)); + + if (PyErr_Occurred()) { + Py_DECREF(out); + return NULL; + } +#else + Py_INCREF(str_obj); + PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj); + Py_INCREF(STRINGLIB_EMPTY); + PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY); + Py_INCREF(STRINGLIB_EMPTY); + PyTuple_SET_ITEM(out, 2, (PyObject*) STRINGLIB_EMPTY); +#endif + return out; + } + + PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, pos)); + Py_INCREF(sep_obj); + PyTuple_SET_ITEM(out, 1, sep_obj); + pos += sep_len; + PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str + pos, str_len - pos)); + + if (PyErr_Occurred()) { + Py_DECREF(out); + return NULL; + } + + return out; +} + +Py_LOCAL_INLINE(PyObject*) +STRINGLIB(rpartition)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + PyObject* sep_obj, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len) +{ + PyObject* out; + Py_ssize_t pos; + + if (sep_len == 0) { + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + + out = PyTuple_New(3); + if (!out) + return NULL; + + pos = FASTSEARCH(str, str_len, sep, sep_len, -1, FAST_RSEARCH); + + if (pos < 0) { +#if STRINGLIB_MUTABLE + PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(NULL, 0)); + PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0)); + PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str, str_len)); + + if (PyErr_Occurred()) { + Py_DECREF(out); + return NULL; + } +#else + Py_INCREF(STRINGLIB_EMPTY); + PyTuple_SET_ITEM(out, 0, (PyObject*) STRINGLIB_EMPTY); + Py_INCREF(STRINGLIB_EMPTY); + PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY); + Py_INCREF(str_obj); + PyTuple_SET_ITEM(out, 2, (PyObject*) str_obj); +#endif + return out; + } + + PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, pos)); + Py_INCREF(sep_obj); + PyTuple_SET_ITEM(out, 1, sep_obj); + pos += sep_len; + PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str + pos, str_len - pos)); + + if (PyErr_Occurred()) { + Py_DECREF(out); + return NULL; + } + + return out; +} + diff --git a/contrib/tools/python3/src/Objects/stringlib/replace.h b/contrib/tools/python3/src/Objects/stringlib/replace.h index dcd2fb8ff4..ef318ed6dd 100644 --- a/contrib/tools/python3/src/Objects/stringlib/replace.h +++ b/contrib/tools/python3/src/Objects/stringlib/replace.h @@ -1,53 +1,53 @@ -/* stringlib: replace implementation */ - -#ifndef STRINGLIB_FASTSEARCH_H -#error must include "stringlib/fastsearch.h" before including this module -#endif - -Py_LOCAL_INLINE(void) -STRINGLIB(replace_1char_inplace)(STRINGLIB_CHAR* s, STRINGLIB_CHAR* end, - Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount) -{ - *s = u2; - while (--maxcount && ++s != end) { - /* Find the next character to be replaced. - - If it occurs often, it is faster to scan for it using an inline - loop. If it occurs seldom, it is faster to scan for it using a - function call; the overhead of the function call is amortized - across the many characters that call covers. We start with an - inline loop and use a heuristic to determine whether to fall back - to a function call. */ - if (*s != u1) { - int attempts = 10; - /* search u1 in a dummy loop */ - while (1) { - if (++s == end) - return; - if (*s == u1) - break; - if (!--attempts) { - /* if u1 was not found for attempts iterations, - use FASTSEARCH() or memchr() */ -#if STRINGLIB_SIZEOF_CHAR == 1 - s++; - s = memchr(s, u1, end - s); - if (s == NULL) - return; -#else - Py_ssize_t i; - STRINGLIB_CHAR ch1 = (STRINGLIB_CHAR) u1; - s++; - i = FASTSEARCH(s, end - s, &ch1, 1, 0, FAST_SEARCH); - if (i < 0) - return; - s += i; -#endif - /* restart the dummy loop */ - break; - } - } - } - *s = u2; - } -} +/* stringlib: replace implementation */ + +#ifndef STRINGLIB_FASTSEARCH_H +#error must include "stringlib/fastsearch.h" before including this module +#endif + +Py_LOCAL_INLINE(void) +STRINGLIB(replace_1char_inplace)(STRINGLIB_CHAR* s, STRINGLIB_CHAR* end, + Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount) +{ + *s = u2; + while (--maxcount && ++s != end) { + /* Find the next character to be replaced. + + If it occurs often, it is faster to scan for it using an inline + loop. If it occurs seldom, it is faster to scan for it using a + function call; the overhead of the function call is amortized + across the many characters that call covers. We start with an + inline loop and use a heuristic to determine whether to fall back + to a function call. */ + if (*s != u1) { + int attempts = 10; + /* search u1 in a dummy loop */ + while (1) { + if (++s == end) + return; + if (*s == u1) + break; + if (!--attempts) { + /* if u1 was not found for attempts iterations, + use FASTSEARCH() or memchr() */ +#if STRINGLIB_SIZEOF_CHAR == 1 + s++; + s = memchr(s, u1, end - s); + if (s == NULL) + return; +#else + Py_ssize_t i; + STRINGLIB_CHAR ch1 = (STRINGLIB_CHAR) u1; + s++; + i = FASTSEARCH(s, end - s, &ch1, 1, 0, FAST_SEARCH); + if (i < 0) + return; + s += i; +#endif + /* restart the dummy loop */ + break; + } + } + } + *s = u2; + } +} diff --git a/contrib/tools/python3/src/Objects/stringlib/split.h b/contrib/tools/python3/src/Objects/stringlib/split.h index 23666a8a85..068047f987 100644 --- a/contrib/tools/python3/src/Objects/stringlib/split.h +++ b/contrib/tools/python3/src/Objects/stringlib/split.h @@ -1,390 +1,390 @@ -/* stringlib: split implementation */ - -#ifndef STRINGLIB_FASTSEARCH_H -#error must include "stringlib/fastsearch.h" before including this module -#endif - -/* Overallocate the initial list to reduce the number of reallocs for small - split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three - resizes, to sizes 4, 8, then 16. Most observed string splits are for human - text (roughly 11 words per line) and field delimited data (usually 1-10 - fields). For large strings the split algorithms are bandwidth limited - so increasing the preallocation likely will not improve things.*/ - -#define MAX_PREALLOC 12 - -/* 5 splits gives 6 elements */ -#define PREALLOC_SIZE(maxsplit) \ - (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) - -#define SPLIT_APPEND(data, left, right) \ - sub = STRINGLIB_NEW((data) + (left), \ - (right) - (left)); \ - if (sub == NULL) \ - goto onError; \ - if (PyList_Append(list, sub)) { \ - Py_DECREF(sub); \ - goto onError; \ - } \ - else \ - Py_DECREF(sub); - -#define SPLIT_ADD(data, left, right) { \ - sub = STRINGLIB_NEW((data) + (left), \ - (right) - (left)); \ - if (sub == NULL) \ - goto onError; \ - if (count < MAX_PREALLOC) { \ - PyList_SET_ITEM(list, count, sub); \ - } else { \ - if (PyList_Append(list, sub)) { \ - Py_DECREF(sub); \ - goto onError; \ - } \ - else \ - Py_DECREF(sub); \ - } \ - count++; } - - -/* Always force the list to the expected size. */ +/* stringlib: split implementation */ + +#ifndef STRINGLIB_FASTSEARCH_H +#error must include "stringlib/fastsearch.h" before including this module +#endif + +/* Overallocate the initial list to reduce the number of reallocs for small + split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three + resizes, to sizes 4, 8, then 16. Most observed string splits are for human + text (roughly 11 words per line) and field delimited data (usually 1-10 + fields). For large strings the split algorithms are bandwidth limited + so increasing the preallocation likely will not improve things.*/ + +#define MAX_PREALLOC 12 + +/* 5 splits gives 6 elements */ +#define PREALLOC_SIZE(maxsplit) \ + (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) + +#define SPLIT_APPEND(data, left, right) \ + sub = STRINGLIB_NEW((data) + (left), \ + (right) - (left)); \ + if (sub == NULL) \ + goto onError; \ + if (PyList_Append(list, sub)) { \ + Py_DECREF(sub); \ + goto onError; \ + } \ + else \ + Py_DECREF(sub); + +#define SPLIT_ADD(data, left, right) { \ + sub = STRINGLIB_NEW((data) + (left), \ + (right) - (left)); \ + if (sub == NULL) \ + goto onError; \ + if (count < MAX_PREALLOC) { \ + PyList_SET_ITEM(list, count, sub); \ + } else { \ + if (PyList_Append(list, sub)) { \ + Py_DECREF(sub); \ + goto onError; \ + } \ + else \ + Py_DECREF(sub); \ + } \ + count++; } + + +/* Always force the list to the expected size. */ #define FIX_PREALLOC_SIZE(list) Py_SET_SIZE(list, count) - -Py_LOCAL_INLINE(PyObject *) -STRINGLIB(split_whitespace)(PyObject* str_obj, - const STRINGLIB_CHAR* str, Py_ssize_t str_len, - Py_ssize_t maxcount) -{ - Py_ssize_t i, j, count=0; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); - PyObject *sub; - - if (list == NULL) - return NULL; - - i = j = 0; - while (maxcount-- > 0) { - while (i < str_len && STRINGLIB_ISSPACE(str[i])) - i++; - if (i == str_len) break; - j = i; i++; - while (i < str_len && !STRINGLIB_ISSPACE(str[i])) - i++; -#ifndef STRINGLIB_MUTABLE - if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { - /* No whitespace in str_obj, so just use it as list[0] */ - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - count++; - break; - } -#endif - SPLIT_ADD(str, j, i); - } - - if (i < str_len) { - /* Only occurs when maxcount was reached */ - /* Skip any remaining whitespace and copy to end of string */ - while (i < str_len && STRINGLIB_ISSPACE(str[i])) - i++; - if (i != str_len) - SPLIT_ADD(str, i, str_len); - } - FIX_PREALLOC_SIZE(list); - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -Py_LOCAL_INLINE(PyObject *) -STRINGLIB(split_char)(PyObject* str_obj, - const STRINGLIB_CHAR* str, Py_ssize_t str_len, - const STRINGLIB_CHAR ch, - Py_ssize_t maxcount) -{ - Py_ssize_t i, j, count=0; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); - PyObject *sub; - - if (list == NULL) - return NULL; - - i = j = 0; - while ((j < str_len) && (maxcount-- > 0)) { - for(; j < str_len; j++) { - /* I found that using memchr makes no difference */ - if (str[j] == ch) { - SPLIT_ADD(str, i, j); - i = j = j + 1; - break; - } - } - } -#ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { - /* ch not in str_obj, so just use str_obj as list[0] */ - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - count++; - } else -#endif - if (i <= str_len) { - SPLIT_ADD(str, i, str_len); - } - FIX_PREALLOC_SIZE(list); - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -Py_LOCAL_INLINE(PyObject *) -STRINGLIB(split)(PyObject* str_obj, - const STRINGLIB_CHAR* str, Py_ssize_t str_len, - const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, - Py_ssize_t maxcount) -{ - Py_ssize_t i, j, pos, count=0; - PyObject *list, *sub; - - if (sep_len == 0) { - PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; - } - else if (sep_len == 1) - return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount); - - list = PyList_New(PREALLOC_SIZE(maxcount)); - if (list == NULL) - return NULL; - - i = j = 0; - while (maxcount-- > 0) { - pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); - if (pos < 0) - break; - j = i + pos; - SPLIT_ADD(str, i, j); - i = j + sep_len; - } -#ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { - /* No match in str_obj, so just use it as list[0] */ - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - count++; - } else -#endif - { - SPLIT_ADD(str, i, str_len); - } - FIX_PREALLOC_SIZE(list); - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -Py_LOCAL_INLINE(PyObject *) -STRINGLIB(rsplit_whitespace)(PyObject* str_obj, - const STRINGLIB_CHAR* str, Py_ssize_t str_len, - Py_ssize_t maxcount) -{ - Py_ssize_t i, j, count=0; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); - PyObject *sub; - - if (list == NULL) - return NULL; - - i = j = str_len - 1; - while (maxcount-- > 0) { - while (i >= 0 && STRINGLIB_ISSPACE(str[i])) - i--; - if (i < 0) break; - j = i; i--; - while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) - i--; -#ifndef STRINGLIB_MUTABLE - if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) { - /* No whitespace in str_obj, so just use it as list[0] */ - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - count++; - break; - } -#endif - SPLIT_ADD(str, i + 1, j + 1); - } - - if (i >= 0) { - /* Only occurs when maxcount was reached */ - /* Skip any remaining whitespace and copy to beginning of string */ - while (i >= 0 && STRINGLIB_ISSPACE(str[i])) - i--; - if (i >= 0) - SPLIT_ADD(str, 0, i + 1); - } - FIX_PREALLOC_SIZE(list); - if (PyList_Reverse(list) < 0) - goto onError; - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -Py_LOCAL_INLINE(PyObject *) -STRINGLIB(rsplit_char)(PyObject* str_obj, - const STRINGLIB_CHAR* str, Py_ssize_t str_len, - const STRINGLIB_CHAR ch, - Py_ssize_t maxcount) -{ - Py_ssize_t i, j, count=0; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); - PyObject *sub; - - if (list == NULL) - return NULL; - - i = j = str_len - 1; - while ((i >= 0) && (maxcount-- > 0)) { - for(; i >= 0; i--) { - if (str[i] == ch) { - SPLIT_ADD(str, i + 1, j + 1); - j = i = i - 1; - break; - } - } - } -#ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { - /* ch not in str_obj, so just use str_obj as list[0] */ - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - count++; - } else -#endif - if (j >= -1) { - SPLIT_ADD(str, 0, j + 1); - } - FIX_PREALLOC_SIZE(list); - if (PyList_Reverse(list) < 0) - goto onError; - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -Py_LOCAL_INLINE(PyObject *) -STRINGLIB(rsplit)(PyObject* str_obj, - const STRINGLIB_CHAR* str, Py_ssize_t str_len, - const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, - Py_ssize_t maxcount) -{ - Py_ssize_t j, pos, count=0; - PyObject *list, *sub; - - if (sep_len == 0) { - PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; - } - else if (sep_len == 1) - return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount); - - list = PyList_New(PREALLOC_SIZE(maxcount)); - if (list == NULL) - return NULL; - - j = str_len; - while (maxcount-- > 0) { - pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH); - if (pos < 0) - break; - SPLIT_ADD(str, pos + sep_len, j); - j = pos; - } -#ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { - /* No match in str_obj, so just use it as list[0] */ - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - count++; - } else -#endif - { - SPLIT_ADD(str, 0, j); - } - FIX_PREALLOC_SIZE(list); - if (PyList_Reverse(list) < 0) - goto onError; - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -Py_LOCAL_INLINE(PyObject *) -STRINGLIB(splitlines)(PyObject* str_obj, - const STRINGLIB_CHAR* str, Py_ssize_t str_len, - int keepends) -{ - /* This does not use the preallocated list because splitlines is - usually run with hundreds of newlines. The overhead of - switching between PyList_SET_ITEM and append causes about a - 2-3% slowdown for that common case. A smarter implementation - could move the if check out, so the SET_ITEMs are done first - and the appends only done when the prealloc buffer is full. - That's too much work for little gain.*/ - - Py_ssize_t i; - Py_ssize_t j; - PyObject *list = PyList_New(0); - PyObject *sub; - - if (list == NULL) - return NULL; - - for (i = j = 0; i < str_len; ) { - Py_ssize_t eol; - - /* Find a line and append it */ - while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i])) - i++; - - /* Skip the line break reading CRLF as one line break */ - eol = i; - if (i < str_len) { - if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n') - i += 2; - else - i++; - if (keepends) - eol = i; - } -#ifndef STRINGLIB_MUTABLE - if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { - /* No linebreak in str_obj, so just use it as list[0] */ - if (PyList_Append(list, str_obj)) - goto onError; - break; - } -#endif - SPLIT_APPEND(str, j, eol); - j = i; - } - return list; - - onError: - Py_DECREF(list); - return NULL; -} - + +Py_LOCAL_INLINE(PyObject *) +STRINGLIB(split_whitespace)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = 0; + while (maxcount-- > 0) { + while (i < str_len && STRINGLIB_ISSPACE(str[i])) + i++; + if (i == str_len) break; + j = i; i++; + while (i < str_len && !STRINGLIB_ISSPACE(str[i])) + i++; +#ifndef STRINGLIB_MUTABLE + if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No whitespace in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + break; + } +#endif + SPLIT_ADD(str, j, i); + } + + if (i < str_len) { + /* Only occurs when maxcount was reached */ + /* Skip any remaining whitespace and copy to end of string */ + while (i < str_len && STRINGLIB_ISSPACE(str[i])) + i++; + if (i != str_len) + SPLIT_ADD(str, i, str_len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +STRINGLIB(split_char)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR ch, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = 0; + while ((j < str_len) && (maxcount-- > 0)) { + for(; j < str_len; j++) { + /* I found that using memchr makes no difference */ + if (str[j] == ch) { + SPLIT_ADD(str, i, j); + i = j = j + 1; + break; + } + } + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + if (i <= str_len) { + SPLIT_ADD(str, i, str_len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +STRINGLIB(split)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, pos, count=0; + PyObject *list, *sub; + + if (sep_len == 0) { + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + else if (sep_len == 1) + return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount); + + list = PyList_New(PREALLOC_SIZE(maxcount)); + if (list == NULL) + return NULL; + + i = j = 0; + while (maxcount-- > 0) { + pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); + if (pos < 0) + break; + j = i + pos; + SPLIT_ADD(str, i, j); + i = j + sep_len; + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No match in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + { + SPLIT_ADD(str, i, str_len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +STRINGLIB(rsplit_whitespace)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = str_len - 1; + while (maxcount-- > 0) { + while (i >= 0 && STRINGLIB_ISSPACE(str[i])) + i--; + if (i < 0) break; + j = i; i--; + while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) + i--; +#ifndef STRINGLIB_MUTABLE + if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No whitespace in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + break; + } +#endif + SPLIT_ADD(str, i + 1, j + 1); + } + + if (i >= 0) { + /* Only occurs when maxcount was reached */ + /* Skip any remaining whitespace and copy to beginning of string */ + while (i >= 0 && STRINGLIB_ISSPACE(str[i])) + i--; + if (i >= 0) + SPLIT_ADD(str, 0, i + 1); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +STRINGLIB(rsplit_char)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR ch, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = str_len - 1; + while ((i >= 0) && (maxcount-- > 0)) { + for(; i >= 0; i--) { + if (str[i] == ch) { + SPLIT_ADD(str, i + 1, j + 1); + j = i = i - 1; + break; + } + } + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + if (j >= -1) { + SPLIT_ADD(str, 0, j + 1); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +STRINGLIB(rsplit)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, + Py_ssize_t maxcount) +{ + Py_ssize_t j, pos, count=0; + PyObject *list, *sub; + + if (sep_len == 0) { + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + else if (sep_len == 1) + return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount); + + list = PyList_New(PREALLOC_SIZE(maxcount)); + if (list == NULL) + return NULL; + + j = str_len; + while (maxcount-- > 0) { + pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH); + if (pos < 0) + break; + SPLIT_ADD(str, pos + sep_len, j); + j = pos; + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No match in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + { + SPLIT_ADD(str, 0, j); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +STRINGLIB(splitlines)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + int keepends) +{ + /* This does not use the preallocated list because splitlines is + usually run with hundreds of newlines. The overhead of + switching between PyList_SET_ITEM and append causes about a + 2-3% slowdown for that common case. A smarter implementation + could move the if check out, so the SET_ITEMs are done first + and the appends only done when the prealloc buffer is full. + That's too much work for little gain.*/ + + Py_ssize_t i; + Py_ssize_t j; + PyObject *list = PyList_New(0); + PyObject *sub; + + if (list == NULL) + return NULL; + + for (i = j = 0; i < str_len; ) { + Py_ssize_t eol; + + /* Find a line and append it */ + while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i])) + i++; + + /* Skip the line break reading CRLF as one line break */ + eol = i; + if (i < str_len) { + if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n') + i += 2; + else + i++; + if (keepends) + eol = i; + } +#ifndef STRINGLIB_MUTABLE + if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No linebreak in str_obj, so just use it as list[0] */ + if (PyList_Append(list, str_obj)) + goto onError; + break; + } +#endif + SPLIT_APPEND(str, j, eol); + j = i; + } + return list; + + onError: + Py_DECREF(list); + return NULL; +} + diff --git a/contrib/tools/python3/src/Objects/stringlib/stringdefs.h b/contrib/tools/python3/src/Objects/stringlib/stringdefs.h index 7628f538c9..ce27f3e408 100644 --- a/contrib/tools/python3/src/Objects/stringlib/stringdefs.h +++ b/contrib/tools/python3/src/Objects/stringlib/stringdefs.h @@ -1,28 +1,28 @@ -#ifndef STRINGLIB_STRINGDEFS_H -#define STRINGLIB_STRINGDEFS_H - -/* this is sort of a hack. there's at least one place (formatting - floats) where some stringlib code takes a different path if it's - compiled as unicode. */ -#define STRINGLIB_IS_UNICODE 0 - -#define FASTSEARCH fastsearch -#define STRINGLIB(F) stringlib_##F -#define STRINGLIB_OBJECT PyBytesObject -#define STRINGLIB_SIZEOF_CHAR 1 -#define STRINGLIB_CHAR char -#define STRINGLIB_TYPE_NAME "string" -#define STRINGLIB_PARSE_CODE "S" -#define STRINGLIB_EMPTY nullstring -#define STRINGLIB_ISSPACE Py_ISSPACE -#define STRINGLIB_ISLINEBREAK(x) ((x == '\n') || (x == '\r')) -#define STRINGLIB_ISDECIMAL(x) ((x >= '0') && (x <= '9')) -#define STRINGLIB_TODECIMAL(x) (STRINGLIB_ISDECIMAL(x) ? (x - '0') : -1) -#define STRINGLIB_STR PyBytes_AS_STRING -#define STRINGLIB_LEN PyBytes_GET_SIZE -#define STRINGLIB_NEW PyBytes_FromStringAndSize -#define STRINGLIB_CHECK PyBytes_Check -#define STRINGLIB_CHECK_EXACT PyBytes_CheckExact -#define STRINGLIB_TOSTR PyObject_Str -#define STRINGLIB_TOASCII PyObject_Repr -#endif /* !STRINGLIB_STRINGDEFS_H */ +#ifndef STRINGLIB_STRINGDEFS_H +#define STRINGLIB_STRINGDEFS_H + +/* this is sort of a hack. there's at least one place (formatting + floats) where some stringlib code takes a different path if it's + compiled as unicode. */ +#define STRINGLIB_IS_UNICODE 0 + +#define FASTSEARCH fastsearch +#define STRINGLIB(F) stringlib_##F +#define STRINGLIB_OBJECT PyBytesObject +#define STRINGLIB_SIZEOF_CHAR 1 +#define STRINGLIB_CHAR char +#define STRINGLIB_TYPE_NAME "string" +#define STRINGLIB_PARSE_CODE "S" +#define STRINGLIB_EMPTY nullstring +#define STRINGLIB_ISSPACE Py_ISSPACE +#define STRINGLIB_ISLINEBREAK(x) ((x == '\n') || (x == '\r')) +#define STRINGLIB_ISDECIMAL(x) ((x >= '0') && (x <= '9')) +#define STRINGLIB_TODECIMAL(x) (STRINGLIB_ISDECIMAL(x) ? (x - '0') : -1) +#define STRINGLIB_STR PyBytes_AS_STRING +#define STRINGLIB_LEN PyBytes_GET_SIZE +#define STRINGLIB_NEW PyBytes_FromStringAndSize +#define STRINGLIB_CHECK PyBytes_Check +#define STRINGLIB_CHECK_EXACT PyBytes_CheckExact +#define STRINGLIB_TOSTR PyObject_Str +#define STRINGLIB_TOASCII PyObject_Repr +#endif /* !STRINGLIB_STRINGDEFS_H */ diff --git a/contrib/tools/python3/src/Objects/stringlib/transmogrify.h b/contrib/tools/python3/src/Objects/stringlib/transmogrify.h index 33c6abba09..e1165ea38e 100644 --- a/contrib/tools/python3/src/Objects/stringlib/transmogrify.h +++ b/contrib/tools/python3/src/Objects/stringlib/transmogrify.h @@ -1,10 +1,10 @@ -#if STRINGLIB_IS_UNICODE -# error "transmogrify.h only compatible with byte-wise strings" -#endif - -/* the more complicated methods. parts of these should be pulled out into the - shared code in bytes_methods.c to cut down on duplicate code bloat. */ - +#if STRINGLIB_IS_UNICODE +# error "transmogrify.h only compatible with byte-wise strings" +#endif + +/* the more complicated methods. parts of these should be pulled out into the + shared code in bytes_methods.c to cut down on duplicate code bloat. */ + /*[clinic input] class B "PyObject *" "&PyType_Type" [clinic start generated code]*/ @@ -12,18 +12,18 @@ class B "PyObject *" "&PyType_Type" #include "clinic/transmogrify.h.h" -static inline PyObject * -return_self(PyObject *self) -{ -#if !STRINGLIB_MUTABLE - if (STRINGLIB_CHECK_EXACT(self)) { - Py_INCREF(self); - return self; - } -#endif - return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -} - +static inline PyObject * +return_self(PyObject *self) +{ +#if !STRINGLIB_MUTABLE + if (STRINGLIB_CHECK_EXACT(self)) { + Py_INCREF(self); + return self; + } +#endif + return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +} + /*[clinic input] B.expandtabs as stringlib_expandtabs @@ -37,100 +37,100 @@ If tabsize is not given, a tab size of 8 characters is assumed. static PyObject * stringlib_expandtabs_impl(PyObject *self, int tabsize) /*[clinic end generated code: output=069cb7fae72e4c2b input=3c6d3b12aa3ccbea]*/ -{ - const char *e, *p; - char *q; - Py_ssize_t i, j; - PyObject *u; - - /* First pass: determine size of output string */ - i = j = 0; - e = STRINGLIB_STR(self) + STRINGLIB_LEN(self); - for (p = STRINGLIB_STR(self); p < e; p++) { - if (*p == '\t') { - if (tabsize > 0) { - Py_ssize_t incr = tabsize - (j % tabsize); - if (j > PY_SSIZE_T_MAX - incr) - goto overflow; - j += incr; - } - } - else { - if (j > PY_SSIZE_T_MAX - 1) - goto overflow; - j++; - if (*p == '\n' || *p == '\r') { - if (i > PY_SSIZE_T_MAX - j) - goto overflow; - i += j; - j = 0; - } - } - } - - if (i > PY_SSIZE_T_MAX - j) - goto overflow; - - /* Second pass: create output string and fill it */ - u = STRINGLIB_NEW(NULL, i + j); - if (!u) - return NULL; - - j = 0; - q = STRINGLIB_STR(u); - - for (p = STRINGLIB_STR(self); p < e; p++) { - if (*p == '\t') { - if (tabsize > 0) { - i = tabsize - (j % tabsize); - j += i; - while (i--) - *q++ = ' '; - } - } - else { - j++; - *q++ = *p; - if (*p == '\n' || *p == '\r') - j = 0; - } - } - - return u; - overflow: - PyErr_SetString(PyExc_OverflowError, "result too long"); - return NULL; -} - -static inline PyObject * -pad(PyObject *self, Py_ssize_t left, Py_ssize_t right, char fill) -{ - PyObject *u; - - if (left < 0) - left = 0; - if (right < 0) - right = 0; - - if (left == 0 && right == 0) { - return return_self(self); - } - - u = STRINGLIB_NEW(NULL, left + STRINGLIB_LEN(self) + right); - if (u) { - if (left) - memset(STRINGLIB_STR(u), fill, left); - memcpy(STRINGLIB_STR(u) + left, - STRINGLIB_STR(self), - STRINGLIB_LEN(self)); - if (right) - memset(STRINGLIB_STR(u) + left + STRINGLIB_LEN(self), - fill, right); - } - - return u; -} - +{ + const char *e, *p; + char *q; + Py_ssize_t i, j; + PyObject *u; + + /* First pass: determine size of output string */ + i = j = 0; + e = STRINGLIB_STR(self) + STRINGLIB_LEN(self); + for (p = STRINGLIB_STR(self); p < e; p++) { + if (*p == '\t') { + if (tabsize > 0) { + Py_ssize_t incr = tabsize - (j % tabsize); + if (j > PY_SSIZE_T_MAX - incr) + goto overflow; + j += incr; + } + } + else { + if (j > PY_SSIZE_T_MAX - 1) + goto overflow; + j++; + if (*p == '\n' || *p == '\r') { + if (i > PY_SSIZE_T_MAX - j) + goto overflow; + i += j; + j = 0; + } + } + } + + if (i > PY_SSIZE_T_MAX - j) + goto overflow; + + /* Second pass: create output string and fill it */ + u = STRINGLIB_NEW(NULL, i + j); + if (!u) + return NULL; + + j = 0; + q = STRINGLIB_STR(u); + + for (p = STRINGLIB_STR(self); p < e; p++) { + if (*p == '\t') { + if (tabsize > 0) { + i = tabsize - (j % tabsize); + j += i; + while (i--) + *q++ = ' '; + } + } + else { + j++; + *q++ = *p; + if (*p == '\n' || *p == '\r') + j = 0; + } + } + + return u; + overflow: + PyErr_SetString(PyExc_OverflowError, "result too long"); + return NULL; +} + +static inline PyObject * +pad(PyObject *self, Py_ssize_t left, Py_ssize_t right, char fill) +{ + PyObject *u; + + if (left < 0) + left = 0; + if (right < 0) + right = 0; + + if (left == 0 && right == 0) { + return return_self(self); + } + + u = STRINGLIB_NEW(NULL, left + STRINGLIB_LEN(self) + right); + if (u) { + if (left) + memset(STRINGLIB_STR(u), fill, left); + memcpy(STRINGLIB_STR(u) + left, + STRINGLIB_STR(self), + STRINGLIB_LEN(self)); + if (right) + memset(STRINGLIB_STR(u) + left + STRINGLIB_LEN(self), + fill, right); + } + + return u; +} + /*[clinic input] B.ljust as stringlib_ljust @@ -143,18 +143,18 @@ Return a left-justified string of length width. Padding is done using the specified fill character. [clinic start generated code]*/ -static PyObject * +static PyObject * stringlib_ljust_impl(PyObject *self, Py_ssize_t width, char fillchar) /*[clinic end generated code: output=c79ca173c5ff8337 input=eff2d014bc7d80df]*/ -{ - if (STRINGLIB_LEN(self) >= width) { - return return_self(self); - } - - return pad(self, 0, width - STRINGLIB_LEN(self), fillchar); -} - - +{ + if (STRINGLIB_LEN(self) >= width) { + return return_self(self); + } + + return pad(self, 0, width - STRINGLIB_LEN(self), fillchar); +} + + /*[clinic input] B.rjust as stringlib_rjust @@ -167,18 +167,18 @@ Return a right-justified string of length width. Padding is done using the specified fill character. [clinic start generated code]*/ -static PyObject * +static PyObject * stringlib_rjust_impl(PyObject *self, Py_ssize_t width, char fillchar) /*[clinic end generated code: output=7df5d728a5439570 input=218b0bd31308955d]*/ -{ - if (STRINGLIB_LEN(self) >= width) { - return return_self(self); - } - - return pad(self, width - STRINGLIB_LEN(self), 0, fillchar); -} - - +{ + if (STRINGLIB_LEN(self) >= width) { + return return_self(self); + } + + return pad(self, width - STRINGLIB_LEN(self), 0, fillchar); +} + + /*[clinic input] B.center as stringlib_center @@ -191,22 +191,22 @@ Return a centered string of length width. Padding is done using the specified fill character. [clinic start generated code]*/ -static PyObject * +static PyObject * stringlib_center_impl(PyObject *self, Py_ssize_t width, char fillchar) /*[clinic end generated code: output=d8da2e055288b4c2 input=3776fd278765d89b]*/ -{ - Py_ssize_t marg, left; - - if (STRINGLIB_LEN(self) >= width) { - return return_self(self); - } - - marg = width - STRINGLIB_LEN(self); - left = marg / 2 + (marg & width & 1); - - return pad(self, left, marg - left, fillchar); -} - +{ + Py_ssize_t marg, left; + + if (STRINGLIB_LEN(self) >= width) { + return return_self(self); + } + + marg = width - STRINGLIB_LEN(self); + left = marg / 2 + (marg & width & 1); + + return pad(self, left, marg - left, fillchar); +} + /*[clinic input] B.zfill as stringlib_zfill @@ -218,523 +218,523 @@ Pad a numeric string with zeros on the left, to fill a field of the given width. The original string is never truncated. [clinic start generated code]*/ -static PyObject * +static PyObject * stringlib_zfill_impl(PyObject *self, Py_ssize_t width) /*[clinic end generated code: output=0b3c684a7f1b2319 input=2da6d7b8e9bcb19a]*/ -{ - Py_ssize_t fill; - PyObject *s; - char *p; - - if (STRINGLIB_LEN(self) >= width) { - return return_self(self); - } - - fill = width - STRINGLIB_LEN(self); - - s = pad(self, fill, 0, '0'); - - if (s == NULL) - return NULL; - - p = STRINGLIB_STR(s); - if (p[fill] == '+' || p[fill] == '-') { - /* move sign to beginning of string */ - p[0] = p[fill]; - p[fill] = '0'; - } - - return s; -} - - -/* find and count characters and substrings */ - -#define findchar(target, target_len, c) \ - ((char *)memchr((const void *)(target), c, target_len)) - - -static Py_ssize_t -countchar(const char *target, Py_ssize_t target_len, char c, - Py_ssize_t maxcount) -{ - Py_ssize_t count = 0; - const char *start = target; - const char *end = target + target_len; - - while ((start = findchar(start, end - start, c)) != NULL) { - count++; - if (count >= maxcount) - break; - start += 1; - } - return count; -} - - -/* Algorithms for different cases of string replacement */ - -/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */ -static PyObject * -stringlib_replace_interleave(PyObject *self, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - const char *self_s; - char *result_s; - Py_ssize_t self_len, result_len; - Py_ssize_t count, i; - PyObject *result; - - self_len = STRINGLIB_LEN(self); - - /* 1 at the end plus 1 after every character; - count = min(maxcount, self_len + 1) */ - if (maxcount <= self_len) { - count = maxcount; - } - else { - /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. */ - count = self_len + 1; - } - - /* Check for overflow */ - /* result_len = count * to_len + self_len; */ - assert(count > 0); - if (to_len > (PY_SSIZE_T_MAX - self_len) / count) { - PyErr_SetString(PyExc_OverflowError, - "replace bytes is too long"); - return NULL; - } - result_len = count * to_len + self_len; - result = STRINGLIB_NEW(NULL, result_len); - if (result == NULL) { - return NULL; - } - - self_s = STRINGLIB_STR(self); - result_s = STRINGLIB_STR(result); - - if (to_len > 1) { - /* Lay the first one down (guaranteed this will occur) */ - memcpy(result_s, to_s, to_len); - result_s += to_len; - count -= 1; - - for (i = 0; i < count; i++) { - *result_s++ = *self_s++; - memcpy(result_s, to_s, to_len); - result_s += to_len; - } - } - else { - result_s[0] = to_s[0]; - result_s += to_len; - count -= 1; - for (i = 0; i < count; i++) { - *result_s++ = *self_s++; - result_s[0] = to_s[0]; - result_s += to_len; - } - } - - /* Copy the rest of the original string */ - memcpy(result_s, self_s, self_len - i); - - return result; -} - -/* Special case for deleting a single character */ -/* len(self)>=1, len(from)==1, to="", maxcount>=1 */ -static PyObject * -stringlib_replace_delete_single_character(PyObject *self, - char from_c, Py_ssize_t maxcount) -{ - const char *self_s, *start, *next, *end; - char *result_s; - Py_ssize_t self_len, result_len; - Py_ssize_t count; - PyObject *result; - - self_len = STRINGLIB_LEN(self); - self_s = STRINGLIB_STR(self); - - count = countchar(self_s, self_len, from_c, maxcount); - if (count == 0) { - return return_self(self); - } - - result_len = self_len - count; /* from_len == 1 */ - assert(result_len>=0); - - result = STRINGLIB_NEW(NULL, result_len); - if (result == NULL) { - return NULL; - } - result_s = STRINGLIB_STR(result); - - start = self_s; - end = self_s + self_len; - while (count-- > 0) { - next = findchar(start, end - start, from_c); - if (next == NULL) - break; - memcpy(result_s, start, next - start); - result_s += (next - start); - start = next + 1; - } - memcpy(result_s, start, end - start); - - return result; -} - -/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */ - -static PyObject * -stringlib_replace_delete_substring(PyObject *self, - const char *from_s, Py_ssize_t from_len, - Py_ssize_t maxcount) -{ - const char *self_s, *start, *next, *end; - char *result_s; - Py_ssize_t self_len, result_len; - Py_ssize_t count, offset; - PyObject *result; - - self_len = STRINGLIB_LEN(self); - self_s = STRINGLIB_STR(self); - - count = stringlib_count(self_s, self_len, - from_s, from_len, - maxcount); - - if (count == 0) { - /* no matches */ - return return_self(self); - } - - result_len = self_len - (count * from_len); - assert (result_len>=0); - - result = STRINGLIB_NEW(NULL, result_len); - if (result == NULL) { - return NULL; - } - result_s = STRINGLIB_STR(result); - - start = self_s; - end = self_s + self_len; - while (count-- > 0) { - offset = stringlib_find(start, end - start, - from_s, from_len, - 0); - if (offset == -1) - break; - next = start + offset; - - memcpy(result_s, start, next - start); - - result_s += (next - start); - start = next + from_len; - } - memcpy(result_s, start, end - start); - return result; -} - -/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */ -static PyObject * -stringlib_replace_single_character_in_place(PyObject *self, - char from_c, char to_c, - Py_ssize_t maxcount) -{ - const char *self_s, *end; - char *result_s, *start, *next; - Py_ssize_t self_len; - PyObject *result; - - /* The result string will be the same size */ - self_s = STRINGLIB_STR(self); - self_len = STRINGLIB_LEN(self); - - next = findchar(self_s, self_len, from_c); - - if (next == NULL) { - /* No matches; return the original bytes */ - return return_self(self); - } - - /* Need to make a new bytes */ - result = STRINGLIB_NEW(NULL, self_len); - if (result == NULL) { - return NULL; - } - result_s = STRINGLIB_STR(result); - memcpy(result_s, self_s, self_len); - - /* change everything in-place, starting with this one */ - start = result_s + (next - self_s); - *start = to_c; - start++; - end = result_s + self_len; - - while (--maxcount > 0) { - next = findchar(start, end - start, from_c); - if (next == NULL) - break; - *next = to_c; - start = next + 1; - } - - return result; -} - -/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */ -static PyObject * -stringlib_replace_substring_in_place(PyObject *self, - const char *from_s, Py_ssize_t from_len, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - const char *self_s, *end; - char *result_s, *start; - Py_ssize_t self_len, offset; - PyObject *result; - - /* The result bytes will be the same size */ - - self_s = STRINGLIB_STR(self); - self_len = STRINGLIB_LEN(self); - - offset = stringlib_find(self_s, self_len, - from_s, from_len, - 0); - if (offset == -1) { - /* No matches; return the original bytes */ - return return_self(self); - } - - /* Need to make a new bytes */ - result = STRINGLIB_NEW(NULL, self_len); - if (result == NULL) { - return NULL; - } - result_s = STRINGLIB_STR(result); - memcpy(result_s, self_s, self_len); - - /* change everything in-place, starting with this one */ - start = result_s + offset; - memcpy(start, to_s, from_len); - start += from_len; - end = result_s + self_len; - - while ( --maxcount > 0) { - offset = stringlib_find(start, end - start, - from_s, from_len, - 0); - if (offset == -1) - break; - memcpy(start + offset, to_s, from_len); - start += offset + from_len; - } - - return result; -} - -/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */ -static PyObject * -stringlib_replace_single_character(PyObject *self, - char from_c, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - const char *self_s, *start, *next, *end; - char *result_s; - Py_ssize_t self_len, result_len; - Py_ssize_t count; - PyObject *result; - - self_s = STRINGLIB_STR(self); - self_len = STRINGLIB_LEN(self); - - count = countchar(self_s, self_len, from_c, maxcount); - if (count == 0) { - /* no matches, return unchanged */ - return return_self(self); - } - - /* use the difference between current and new, hence the "-1" */ - /* result_len = self_len + count * (to_len-1) */ - assert(count > 0); - if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) { - PyErr_SetString(PyExc_OverflowError, "replace bytes is too long"); - return NULL; - } - result_len = self_len + count * (to_len - 1); - - result = STRINGLIB_NEW(NULL, result_len); - if (result == NULL) { - return NULL; - } - result_s = STRINGLIB_STR(result); - - start = self_s; - end = self_s + self_len; - while (count-- > 0) { - next = findchar(start, end - start, from_c); - if (next == NULL) - break; - - if (next == start) { - /* replace with the 'to' */ - memcpy(result_s, to_s, to_len); - result_s += to_len; - start += 1; - } else { - /* copy the unchanged old then the 'to' */ - memcpy(result_s, start, next - start); - result_s += (next - start); - memcpy(result_s, to_s, to_len); - result_s += to_len; - start = next + 1; - } - } - /* Copy the remainder of the remaining bytes */ - memcpy(result_s, start, end - start); - - return result; -} - -/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */ -static PyObject * -stringlib_replace_substring(PyObject *self, - const char *from_s, Py_ssize_t from_len, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - const char *self_s, *start, *next, *end; - char *result_s; - Py_ssize_t self_len, result_len; - Py_ssize_t count, offset; - PyObject *result; - - self_s = STRINGLIB_STR(self); - self_len = STRINGLIB_LEN(self); - - count = stringlib_count(self_s, self_len, - from_s, from_len, - maxcount); - - if (count == 0) { - /* no matches, return unchanged */ - return return_self(self); - } - - /* Check for overflow */ - /* result_len = self_len + count * (to_len-from_len) */ - assert(count > 0); - if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) { - PyErr_SetString(PyExc_OverflowError, "replace bytes is too long"); - return NULL; - } - result_len = self_len + count * (to_len - from_len); - - result = STRINGLIB_NEW(NULL, result_len); - if (result == NULL) { - return NULL; - } - result_s = STRINGLIB_STR(result); - - start = self_s; - end = self_s + self_len; - while (count-- > 0) { - offset = stringlib_find(start, end - start, - from_s, from_len, - 0); - if (offset == -1) - break; - next = start + offset; - if (next == start) { - /* replace with the 'to' */ - memcpy(result_s, to_s, to_len); - result_s += to_len; - start += from_len; - } else { - /* copy the unchanged old then the 'to' */ - memcpy(result_s, start, next - start); - result_s += (next - start); - memcpy(result_s, to_s, to_len); - result_s += to_len; - start = next + from_len; - } - } - /* Copy the remainder of the remaining bytes */ - memcpy(result_s, start, end - start); - - return result; -} - - -static PyObject * -stringlib_replace(PyObject *self, - const char *from_s, Py_ssize_t from_len, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ +{ + Py_ssize_t fill; + PyObject *s; + char *p; + + if (STRINGLIB_LEN(self) >= width) { + return return_self(self); + } + + fill = width - STRINGLIB_LEN(self); + + s = pad(self, fill, 0, '0'); + + if (s == NULL) + return NULL; + + p = STRINGLIB_STR(s); + if (p[fill] == '+' || p[fill] == '-') { + /* move sign to beginning of string */ + p[0] = p[fill]; + p[fill] = '0'; + } + + return s; +} + + +/* find and count characters and substrings */ + +#define findchar(target, target_len, c) \ + ((char *)memchr((const void *)(target), c, target_len)) + + +static Py_ssize_t +countchar(const char *target, Py_ssize_t target_len, char c, + Py_ssize_t maxcount) +{ + Py_ssize_t count = 0; + const char *start = target; + const char *end = target + target_len; + + while ((start = findchar(start, end - start, c)) != NULL) { + count++; + if (count >= maxcount) + break; + start += 1; + } + return count; +} + + +/* Algorithms for different cases of string replacement */ + +/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */ +static PyObject * +stringlib_replace_interleave(PyObject *self, + const char *to_s, Py_ssize_t to_len, + Py_ssize_t maxcount) +{ + const char *self_s; + char *result_s; + Py_ssize_t self_len, result_len; + Py_ssize_t count, i; + PyObject *result; + + self_len = STRINGLIB_LEN(self); + + /* 1 at the end plus 1 after every character; + count = min(maxcount, self_len + 1) */ + if (maxcount <= self_len) { + count = maxcount; + } + else { + /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. */ + count = self_len + 1; + } + + /* Check for overflow */ + /* result_len = count * to_len + self_len; */ + assert(count > 0); + if (to_len > (PY_SSIZE_T_MAX - self_len) / count) { + PyErr_SetString(PyExc_OverflowError, + "replace bytes is too long"); + return NULL; + } + result_len = count * to_len + self_len; + result = STRINGLIB_NEW(NULL, result_len); + if (result == NULL) { + return NULL; + } + + self_s = STRINGLIB_STR(self); + result_s = STRINGLIB_STR(result); + + if (to_len > 1) { + /* Lay the first one down (guaranteed this will occur) */ + memcpy(result_s, to_s, to_len); + result_s += to_len; + count -= 1; + + for (i = 0; i < count; i++) { + *result_s++ = *self_s++; + memcpy(result_s, to_s, to_len); + result_s += to_len; + } + } + else { + result_s[0] = to_s[0]; + result_s += to_len; + count -= 1; + for (i = 0; i < count; i++) { + *result_s++ = *self_s++; + result_s[0] = to_s[0]; + result_s += to_len; + } + } + + /* Copy the rest of the original string */ + memcpy(result_s, self_s, self_len - i); + + return result; +} + +/* Special case for deleting a single character */ +/* len(self)>=1, len(from)==1, to="", maxcount>=1 */ +static PyObject * +stringlib_replace_delete_single_character(PyObject *self, + char from_c, Py_ssize_t maxcount) +{ + const char *self_s, *start, *next, *end; + char *result_s; + Py_ssize_t self_len, result_len; + Py_ssize_t count; + PyObject *result; + + self_len = STRINGLIB_LEN(self); + self_s = STRINGLIB_STR(self); + + count = countchar(self_s, self_len, from_c, maxcount); + if (count == 0) { + return return_self(self); + } + + result_len = self_len - count; /* from_len == 1 */ + assert(result_len>=0); + + result = STRINGLIB_NEW(NULL, result_len); + if (result == NULL) { + return NULL; + } + result_s = STRINGLIB_STR(result); + + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + next = findchar(start, end - start, from_c); + if (next == NULL) + break; + memcpy(result_s, start, next - start); + result_s += (next - start); + start = next + 1; + } + memcpy(result_s, start, end - start); + + return result; +} + +/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */ + +static PyObject * +stringlib_replace_delete_substring(PyObject *self, + const char *from_s, Py_ssize_t from_len, + Py_ssize_t maxcount) +{ + const char *self_s, *start, *next, *end; + char *result_s; + Py_ssize_t self_len, result_len; + Py_ssize_t count, offset; + PyObject *result; + + self_len = STRINGLIB_LEN(self); + self_s = STRINGLIB_STR(self); + + count = stringlib_count(self_s, self_len, + from_s, from_len, + maxcount); + + if (count == 0) { + /* no matches */ + return return_self(self); + } + + result_len = self_len - (count * from_len); + assert (result_len>=0); + + result = STRINGLIB_NEW(NULL, result_len); + if (result == NULL) { + return NULL; + } + result_s = STRINGLIB_STR(result); + + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + offset = stringlib_find(start, end - start, + from_s, from_len, + 0); + if (offset == -1) + break; + next = start + offset; + + memcpy(result_s, start, next - start); + + result_s += (next - start); + start = next + from_len; + } + memcpy(result_s, start, end - start); + return result; +} + +/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */ +static PyObject * +stringlib_replace_single_character_in_place(PyObject *self, + char from_c, char to_c, + Py_ssize_t maxcount) +{ + const char *self_s, *end; + char *result_s, *start, *next; + Py_ssize_t self_len; + PyObject *result; + + /* The result string will be the same size */ + self_s = STRINGLIB_STR(self); + self_len = STRINGLIB_LEN(self); + + next = findchar(self_s, self_len, from_c); + + if (next == NULL) { + /* No matches; return the original bytes */ + return return_self(self); + } + + /* Need to make a new bytes */ + result = STRINGLIB_NEW(NULL, self_len); + if (result == NULL) { + return NULL; + } + result_s = STRINGLIB_STR(result); + memcpy(result_s, self_s, self_len); + + /* change everything in-place, starting with this one */ + start = result_s + (next - self_s); + *start = to_c; + start++; + end = result_s + self_len; + + while (--maxcount > 0) { + next = findchar(start, end - start, from_c); + if (next == NULL) + break; + *next = to_c; + start = next + 1; + } + + return result; +} + +/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */ +static PyObject * +stringlib_replace_substring_in_place(PyObject *self, + const char *from_s, Py_ssize_t from_len, + const char *to_s, Py_ssize_t to_len, + Py_ssize_t maxcount) +{ + const char *self_s, *end; + char *result_s, *start; + Py_ssize_t self_len, offset; + PyObject *result; + + /* The result bytes will be the same size */ + + self_s = STRINGLIB_STR(self); + self_len = STRINGLIB_LEN(self); + + offset = stringlib_find(self_s, self_len, + from_s, from_len, + 0); + if (offset == -1) { + /* No matches; return the original bytes */ + return return_self(self); + } + + /* Need to make a new bytes */ + result = STRINGLIB_NEW(NULL, self_len); + if (result == NULL) { + return NULL; + } + result_s = STRINGLIB_STR(result); + memcpy(result_s, self_s, self_len); + + /* change everything in-place, starting with this one */ + start = result_s + offset; + memcpy(start, to_s, from_len); + start += from_len; + end = result_s + self_len; + + while ( --maxcount > 0) { + offset = stringlib_find(start, end - start, + from_s, from_len, + 0); + if (offset == -1) + break; + memcpy(start + offset, to_s, from_len); + start += offset + from_len; + } + + return result; +} + +/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */ +static PyObject * +stringlib_replace_single_character(PyObject *self, + char from_c, + const char *to_s, Py_ssize_t to_len, + Py_ssize_t maxcount) +{ + const char *self_s, *start, *next, *end; + char *result_s; + Py_ssize_t self_len, result_len; + Py_ssize_t count; + PyObject *result; + + self_s = STRINGLIB_STR(self); + self_len = STRINGLIB_LEN(self); + + count = countchar(self_s, self_len, from_c, maxcount); + if (count == 0) { + /* no matches, return unchanged */ + return return_self(self); + } + + /* use the difference between current and new, hence the "-1" */ + /* result_len = self_len + count * (to_len-1) */ + assert(count > 0); + if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) { + PyErr_SetString(PyExc_OverflowError, "replace bytes is too long"); + return NULL; + } + result_len = self_len + count * (to_len - 1); + + result = STRINGLIB_NEW(NULL, result_len); + if (result == NULL) { + return NULL; + } + result_s = STRINGLIB_STR(result); + + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + next = findchar(start, end - start, from_c); + if (next == NULL) + break; + + if (next == start) { + /* replace with the 'to' */ + memcpy(result_s, to_s, to_len); + result_s += to_len; + start += 1; + } else { + /* copy the unchanged old then the 'to' */ + memcpy(result_s, start, next - start); + result_s += (next - start); + memcpy(result_s, to_s, to_len); + result_s += to_len; + start = next + 1; + } + } + /* Copy the remainder of the remaining bytes */ + memcpy(result_s, start, end - start); + + return result; +} + +/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */ +static PyObject * +stringlib_replace_substring(PyObject *self, + const char *from_s, Py_ssize_t from_len, + const char *to_s, Py_ssize_t to_len, + Py_ssize_t maxcount) +{ + const char *self_s, *start, *next, *end; + char *result_s; + Py_ssize_t self_len, result_len; + Py_ssize_t count, offset; + PyObject *result; + + self_s = STRINGLIB_STR(self); + self_len = STRINGLIB_LEN(self); + + count = stringlib_count(self_s, self_len, + from_s, from_len, + maxcount); + + if (count == 0) { + /* no matches, return unchanged */ + return return_self(self); + } + + /* Check for overflow */ + /* result_len = self_len + count * (to_len-from_len) */ + assert(count > 0); + if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) { + PyErr_SetString(PyExc_OverflowError, "replace bytes is too long"); + return NULL; + } + result_len = self_len + count * (to_len - from_len); + + result = STRINGLIB_NEW(NULL, result_len); + if (result == NULL) { + return NULL; + } + result_s = STRINGLIB_STR(result); + + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + offset = stringlib_find(start, end - start, + from_s, from_len, + 0); + if (offset == -1) + break; + next = start + offset; + if (next == start) { + /* replace with the 'to' */ + memcpy(result_s, to_s, to_len); + result_s += to_len; + start += from_len; + } else { + /* copy the unchanged old then the 'to' */ + memcpy(result_s, start, next - start); + result_s += (next - start); + memcpy(result_s, to_s, to_len); + result_s += to_len; + start = next + from_len; + } + } + /* Copy the remainder of the remaining bytes */ + memcpy(result_s, start, end - start); + + return result; +} + + +static PyObject * +stringlib_replace(PyObject *self, + const char *from_s, Py_ssize_t from_len, + const char *to_s, Py_ssize_t to_len, + Py_ssize_t maxcount) +{ if (STRINGLIB_LEN(self) < from_len) { /* nothing to do; return the original bytes */ return return_self(self); } - if (maxcount < 0) { - maxcount = PY_SSIZE_T_MAX; + if (maxcount < 0) { + maxcount = PY_SSIZE_T_MAX; } else if (maxcount == 0) { - /* nothing to do; return the original bytes */ - return return_self(self); - } - - /* Handle zero-length special cases */ - if (from_len == 0) { - if (to_len == 0) { - /* nothing to do; return the original bytes */ - return return_self(self); - } - /* insert the 'to' bytes everywhere. */ - /* >>> b"Python".replace(b"", b".") */ - /* b'.P.y.t.h.o.n.' */ - return stringlib_replace_interleave(self, to_s, to_len, maxcount); - } - - if (to_len == 0) { - /* delete all occurrences of 'from' bytes */ - if (from_len == 1) { - return stringlib_replace_delete_single_character( - self, from_s[0], maxcount); - } else { - return stringlib_replace_delete_substring( - self, from_s, from_len, maxcount); - } - } - - /* Handle special case where both bytes have the same length */ - - if (from_len == to_len) { - if (from_len == 1) { - return stringlib_replace_single_character_in_place( - self, from_s[0], to_s[0], maxcount); - } else { - return stringlib_replace_substring_in_place( - self, from_s, from_len, to_s, to_len, maxcount); - } - } - - /* Otherwise use the more generic algorithms */ - if (from_len == 1) { - return stringlib_replace_single_character( - self, from_s[0], to_s, to_len, maxcount); - } else { - /* len('from')>=2, len('to')>=1 */ - return stringlib_replace_substring( - self, from_s, from_len, to_s, to_len, maxcount); - } -} - -#undef findchar + /* nothing to do; return the original bytes */ + return return_self(self); + } + + /* Handle zero-length special cases */ + if (from_len == 0) { + if (to_len == 0) { + /* nothing to do; return the original bytes */ + return return_self(self); + } + /* insert the 'to' bytes everywhere. */ + /* >>> b"Python".replace(b"", b".") */ + /* b'.P.y.t.h.o.n.' */ + return stringlib_replace_interleave(self, to_s, to_len, maxcount); + } + + if (to_len == 0) { + /* delete all occurrences of 'from' bytes */ + if (from_len == 1) { + return stringlib_replace_delete_single_character( + self, from_s[0], maxcount); + } else { + return stringlib_replace_delete_substring( + self, from_s, from_len, maxcount); + } + } + + /* Handle special case where both bytes have the same length */ + + if (from_len == to_len) { + if (from_len == 1) { + return stringlib_replace_single_character_in_place( + self, from_s[0], to_s[0], maxcount); + } else { + return stringlib_replace_substring_in_place( + self, from_s, from_len, to_s, to_len, maxcount); + } + } + + /* Otherwise use the more generic algorithms */ + if (from_len == 1) { + return stringlib_replace_single_character( + self, from_s[0], to_s, to_len, maxcount); + } else { + /* len('from')>=2, len('to')>=1 */ + return stringlib_replace_substring( + self, from_s, from_len, to_s, to_len, maxcount); + } +} + +#undef findchar diff --git a/contrib/tools/python3/src/Objects/stringlib/ucs1lib.h b/contrib/tools/python3/src/Objects/stringlib/ucs1lib.h index e1776bd840..bc4b104f11 100644 --- a/contrib/tools/python3/src/Objects/stringlib/ucs1lib.h +++ b/contrib/tools/python3/src/Objects/stringlib/ucs1lib.h @@ -1,26 +1,26 @@ -/* this is sort of a hack. there's at least one place (formatting - floats) where some stringlib code takes a different path if it's - compiled as unicode. */ -#define STRINGLIB_IS_UNICODE 1 - -#define FASTSEARCH ucs1lib_fastsearch -#define STRINGLIB(F) ucs1lib_##F -#define STRINGLIB_OBJECT PyUnicodeObject -#define STRINGLIB_SIZEOF_CHAR 1 -#define STRINGLIB_MAX_CHAR 0xFFu -#define STRINGLIB_CHAR Py_UCS1 -#define STRINGLIB_TYPE_NAME "unicode" -#define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_EMPTY unicode_empty -#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE -#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK -#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL -#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL -#define STRINGLIB_STR PyUnicode_1BYTE_DATA -#define STRINGLIB_LEN PyUnicode_GET_LENGTH -#define STRINGLIB_NEW _PyUnicode_FromUCS1 -#define STRINGLIB_CHECK PyUnicode_Check -#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact - -#define STRINGLIB_TOSTR PyObject_Str -#define STRINGLIB_TOASCII PyObject_ASCII +/* this is sort of a hack. there's at least one place (formatting + floats) where some stringlib code takes a different path if it's + compiled as unicode. */ +#define STRINGLIB_IS_UNICODE 1 + +#define FASTSEARCH ucs1lib_fastsearch +#define STRINGLIB(F) ucs1lib_##F +#define STRINGLIB_OBJECT PyUnicodeObject +#define STRINGLIB_SIZEOF_CHAR 1 +#define STRINGLIB_MAX_CHAR 0xFFu +#define STRINGLIB_CHAR Py_UCS1 +#define STRINGLIB_TYPE_NAME "unicode" +#define STRINGLIB_PARSE_CODE "U" +#define STRINGLIB_EMPTY unicode_empty +#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE +#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK +#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL +#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL +#define STRINGLIB_STR PyUnicode_1BYTE_DATA +#define STRINGLIB_LEN PyUnicode_GET_LENGTH +#define STRINGLIB_NEW _PyUnicode_FromUCS1 +#define STRINGLIB_CHECK PyUnicode_Check +#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact + +#define STRINGLIB_TOSTR PyObject_Str +#define STRINGLIB_TOASCII PyObject_ASCII diff --git a/contrib/tools/python3/src/Objects/stringlib/ucs2lib.h b/contrib/tools/python3/src/Objects/stringlib/ucs2lib.h index 36efc36268..86a1dff1b5 100644 --- a/contrib/tools/python3/src/Objects/stringlib/ucs2lib.h +++ b/contrib/tools/python3/src/Objects/stringlib/ucs2lib.h @@ -1,26 +1,26 @@ -/* this is sort of a hack. there's at least one place (formatting - floats) where some stringlib code takes a different path if it's - compiled as unicode. */ -#define STRINGLIB_IS_UNICODE 1 - -#define FASTSEARCH ucs2lib_fastsearch -#define STRINGLIB(F) ucs2lib_##F -#define STRINGLIB_OBJECT PyUnicodeObject -#define STRINGLIB_SIZEOF_CHAR 2 -#define STRINGLIB_MAX_CHAR 0xFFFFu -#define STRINGLIB_CHAR Py_UCS2 -#define STRINGLIB_TYPE_NAME "unicode" -#define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_EMPTY unicode_empty -#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE -#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK -#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL -#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL -#define STRINGLIB_STR PyUnicode_2BYTE_DATA -#define STRINGLIB_LEN PyUnicode_GET_LENGTH -#define STRINGLIB_NEW _PyUnicode_FromUCS2 -#define STRINGLIB_CHECK PyUnicode_Check -#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact - -#define STRINGLIB_TOSTR PyObject_Str -#define STRINGLIB_TOASCII PyObject_ASCII +/* this is sort of a hack. there's at least one place (formatting + floats) where some stringlib code takes a different path if it's + compiled as unicode. */ +#define STRINGLIB_IS_UNICODE 1 + +#define FASTSEARCH ucs2lib_fastsearch +#define STRINGLIB(F) ucs2lib_##F +#define STRINGLIB_OBJECT PyUnicodeObject +#define STRINGLIB_SIZEOF_CHAR 2 +#define STRINGLIB_MAX_CHAR 0xFFFFu +#define STRINGLIB_CHAR Py_UCS2 +#define STRINGLIB_TYPE_NAME "unicode" +#define STRINGLIB_PARSE_CODE "U" +#define STRINGLIB_EMPTY unicode_empty +#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE +#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK +#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL +#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL +#define STRINGLIB_STR PyUnicode_2BYTE_DATA +#define STRINGLIB_LEN PyUnicode_GET_LENGTH +#define STRINGLIB_NEW _PyUnicode_FromUCS2 +#define STRINGLIB_CHECK PyUnicode_Check +#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact + +#define STRINGLIB_TOSTR PyObject_Str +#define STRINGLIB_TOASCII PyObject_ASCII diff --git a/contrib/tools/python3/src/Objects/stringlib/ucs4lib.h b/contrib/tools/python3/src/Objects/stringlib/ucs4lib.h index 86e7fa630f..3c32a93c96 100644 --- a/contrib/tools/python3/src/Objects/stringlib/ucs4lib.h +++ b/contrib/tools/python3/src/Objects/stringlib/ucs4lib.h @@ -1,27 +1,27 @@ -/* this is sort of a hack. there's at least one place (formatting - floats) where some stringlib code takes a different path if it's - compiled as unicode. */ -#define STRINGLIB_IS_UNICODE 1 - -#define FASTSEARCH ucs4lib_fastsearch -#define STRINGLIB(F) ucs4lib_##F -#define STRINGLIB_OBJECT PyUnicodeObject -#define STRINGLIB_SIZEOF_CHAR 4 -#define STRINGLIB_MAX_CHAR 0x10FFFFu -#define STRINGLIB_CHAR Py_UCS4 -#define STRINGLIB_TYPE_NAME "unicode" -#define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_EMPTY unicode_empty -#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE -#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK -#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL -#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL -#define STRINGLIB_STR PyUnicode_4BYTE_DATA -#define STRINGLIB_LEN PyUnicode_GET_LENGTH -#define STRINGLIB_NEW _PyUnicode_FromUCS4 -#define STRINGLIB_CHECK PyUnicode_Check -#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact - -#define STRINGLIB_TOSTR PyObject_Str -#define STRINGLIB_TOASCII PyObject_ASCII - +/* this is sort of a hack. there's at least one place (formatting + floats) where some stringlib code takes a different path if it's + compiled as unicode. */ +#define STRINGLIB_IS_UNICODE 1 + +#define FASTSEARCH ucs4lib_fastsearch +#define STRINGLIB(F) ucs4lib_##F +#define STRINGLIB_OBJECT PyUnicodeObject +#define STRINGLIB_SIZEOF_CHAR 4 +#define STRINGLIB_MAX_CHAR 0x10FFFFu +#define STRINGLIB_CHAR Py_UCS4 +#define STRINGLIB_TYPE_NAME "unicode" +#define STRINGLIB_PARSE_CODE "U" +#define STRINGLIB_EMPTY unicode_empty +#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE +#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK +#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL +#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL +#define STRINGLIB_STR PyUnicode_4BYTE_DATA +#define STRINGLIB_LEN PyUnicode_GET_LENGTH +#define STRINGLIB_NEW _PyUnicode_FromUCS4 +#define STRINGLIB_CHECK PyUnicode_Check +#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact + +#define STRINGLIB_TOSTR PyObject_Str +#define STRINGLIB_TOASCII PyObject_ASCII + diff --git a/contrib/tools/python3/src/Objects/stringlib/undef.h b/contrib/tools/python3/src/Objects/stringlib/undef.h index 53f74b4371..c41e254fde 100644 --- a/contrib/tools/python3/src/Objects/stringlib/undef.h +++ b/contrib/tools/python3/src/Objects/stringlib/undef.h @@ -1,10 +1,10 @@ -#undef FASTSEARCH -#undef STRINGLIB -#undef STRINGLIB_SIZEOF_CHAR -#undef STRINGLIB_MAX_CHAR -#undef STRINGLIB_CHAR -#undef STRINGLIB_STR -#undef STRINGLIB_LEN -#undef STRINGLIB_NEW -#undef STRINGLIB_IS_UNICODE - +#undef FASTSEARCH +#undef STRINGLIB +#undef STRINGLIB_SIZEOF_CHAR +#undef STRINGLIB_MAX_CHAR +#undef STRINGLIB_CHAR +#undef STRINGLIB_STR +#undef STRINGLIB_LEN +#undef STRINGLIB_NEW +#undef STRINGLIB_IS_UNICODE + diff --git a/contrib/tools/python3/src/Objects/stringlib/unicode_format.h b/contrib/tools/python3/src/Objects/stringlib/unicode_format.h index 96b820d0a4..b526ad21b8 100644 --- a/contrib/tools/python3/src/Objects/stringlib/unicode_format.h +++ b/contrib/tools/python3/src/Objects/stringlib/unicode_format.h @@ -1,445 +1,445 @@ -/* - unicode_format.h -- implementation of str.format(). -*/ - -/************************************************************************/ -/*********** Global data structures and forward declarations *********/ -/************************************************************************/ - -/* - A SubString consists of the characters between two string or - unicode pointers. -*/ -typedef struct { - PyObject *str; /* borrowed reference */ - Py_ssize_t start, end; -} SubString; - - -typedef enum { - ANS_INIT, - ANS_AUTO, - ANS_MANUAL -} AutoNumberState; /* Keep track if we're auto-numbering fields */ - -/* Keeps track of our auto-numbering state, and which number field we're on */ -typedef struct { - AutoNumberState an_state; - int an_field_number; -} AutoNumber; - - -/* forward declaration for recursion */ -static PyObject * -build_string(SubString *input, PyObject *args, PyObject *kwargs, - int recursion_depth, AutoNumber *auto_number); - - - -/************************************************************************/ -/************************** Utility functions ************************/ -/************************************************************************/ - -static void -AutoNumber_Init(AutoNumber *auto_number) -{ - auto_number->an_state = ANS_INIT; - auto_number->an_field_number = 0; -} - -/* fill in a SubString from a pointer and length */ -Py_LOCAL_INLINE(void) -SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end) -{ - str->str = s; - str->start = start; - str->end = end; -} - -/* return a new string. if str->str is NULL, return None */ -Py_LOCAL_INLINE(PyObject *) -SubString_new_object(SubString *str) -{ - if (str->str == NULL) - Py_RETURN_NONE; - return PyUnicode_Substring(str->str, str->start, str->end); -} - -/* return a new string. if str->str is NULL, return a new empty string */ -Py_LOCAL_INLINE(PyObject *) -SubString_new_object_or_empty(SubString *str) -{ - if (str->str == NULL) { - return PyUnicode_New(0, 0); - } - return SubString_new_object(str); -} - -/* Return 1 if an error has been detected switching between automatic - field numbering and manual field specification, else return 0. Set - ValueError on error. */ -static int -autonumber_state_error(AutoNumberState state, int field_name_is_empty) -{ - if (state == ANS_MANUAL) { - if (field_name_is_empty) { - PyErr_SetString(PyExc_ValueError, "cannot switch from " - "manual field specification to " - "automatic field numbering"); - return 1; - } - } - else { - if (!field_name_is_empty) { - PyErr_SetString(PyExc_ValueError, "cannot switch from " - "automatic field numbering to " - "manual field specification"); - return 1; - } - } - return 0; -} - - -/************************************************************************/ -/*********** Format string parsing -- integers and identifiers *********/ -/************************************************************************/ - -static Py_ssize_t -get_integer(const SubString *str) -{ - Py_ssize_t accumulator = 0; - Py_ssize_t digitval; - Py_ssize_t i; - - /* empty string is an error */ - if (str->start >= str->end) - return -1; - - for (i = str->start; i < str->end; i++) { - digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i)); - if (digitval < 0) - return -1; - /* - Detect possible overflow before it happens: - - accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if - accumulator > (PY_SSIZE_T_MAX - digitval) / 10. - */ - if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) { - PyErr_Format(PyExc_ValueError, - "Too many decimal digits in format string"); - return -1; - } - accumulator = accumulator * 10 + digitval; - } - return accumulator; -} - -/************************************************************************/ -/******** Functions to get field objects and specification strings ******/ -/************************************************************************/ - -/* do the equivalent of obj.name */ -static PyObject * -getattr(PyObject *obj, SubString *name) -{ - PyObject *newobj; - PyObject *str = SubString_new_object(name); - if (str == NULL) - return NULL; - newobj = PyObject_GetAttr(obj, str); - Py_DECREF(str); - return newobj; -} - -/* do the equivalent of obj[idx], where obj is a sequence */ -static PyObject * -getitem_sequence(PyObject *obj, Py_ssize_t idx) -{ - return PySequence_GetItem(obj, idx); -} - -/* do the equivalent of obj[idx], where obj is not a sequence */ -static PyObject * -getitem_idx(PyObject *obj, Py_ssize_t idx) -{ - PyObject *newobj; - PyObject *idx_obj = PyLong_FromSsize_t(idx); - if (idx_obj == NULL) - return NULL; - newobj = PyObject_GetItem(obj, idx_obj); - Py_DECREF(idx_obj); - return newobj; -} - -/* do the equivalent of obj[name] */ -static PyObject * -getitem_str(PyObject *obj, SubString *name) -{ - PyObject *newobj; - PyObject *str = SubString_new_object(name); - if (str == NULL) - return NULL; - newobj = PyObject_GetItem(obj, str); - Py_DECREF(str); - return newobj; -} - -typedef struct { - /* the entire string we're parsing. we assume that someone else - is managing its lifetime, and that it will exist for the - lifetime of the iterator. can be empty */ - SubString str; - - /* index to where we are inside field_name */ - Py_ssize_t index; -} FieldNameIterator; - - -static int -FieldNameIterator_init(FieldNameIterator *self, PyObject *s, - Py_ssize_t start, Py_ssize_t end) -{ - SubString_init(&self->str, s, start, end); - self->index = start; - return 1; -} - -static int -_FieldNameIterator_attr(FieldNameIterator *self, SubString *name) -{ - Py_UCS4 c; - - name->str = self->str.str; - name->start = self->index; - - /* return everything until '.' or '[' */ - while (self->index < self->str.end) { - c = PyUnicode_READ_CHAR(self->str.str, self->index++); - switch (c) { - case '[': - case '.': - /* backup so that we this character will be seen next time */ - self->index--; - break; - default: - continue; - } - break; - } - /* end of string is okay */ - name->end = self->index; - return 1; -} - -static int -_FieldNameIterator_item(FieldNameIterator *self, SubString *name) -{ - int bracket_seen = 0; - Py_UCS4 c; - - name->str = self->str.str; - name->start = self->index; - - /* return everything until ']' */ - while (self->index < self->str.end) { - c = PyUnicode_READ_CHAR(self->str.str, self->index++); - switch (c) { - case ']': - bracket_seen = 1; - break; - default: - continue; - } - break; - } - /* make sure we ended with a ']' */ - if (!bracket_seen) { - PyErr_SetString(PyExc_ValueError, "Missing ']' in format string"); - return 0; - } - - /* end of string is okay */ - /* don't include the ']' */ - name->end = self->index-1; - return 1; -} - -/* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */ -static int -FieldNameIterator_next(FieldNameIterator *self, int *is_attribute, - Py_ssize_t *name_idx, SubString *name) -{ - /* check at end of input */ - if (self->index >= self->str.end) - return 1; - - switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) { - case '.': - *is_attribute = 1; - if (_FieldNameIterator_attr(self, name) == 0) - return 0; - *name_idx = -1; - break; - case '[': - *is_attribute = 0; - if (_FieldNameIterator_item(self, name) == 0) - return 0; - *name_idx = get_integer(name); - if (*name_idx == -1 && PyErr_Occurred()) - return 0; - break; - default: - /* Invalid character follows ']' */ - PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may " - "follow ']' in format field specifier"); - return 0; - } - - /* empty string is an error */ - if (name->start == name->end) { - PyErr_SetString(PyExc_ValueError, "Empty attribute in format string"); - return 0; - } - - return 2; -} - - -/* input: field_name - output: 'first' points to the part before the first '[' or '.' - 'first_idx' is -1 if 'first' is not an integer, otherwise - it's the value of first converted to an integer - 'rest' is an iterator to return the rest -*/ -static int -field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first, - Py_ssize_t *first_idx, FieldNameIterator *rest, - AutoNumber *auto_number) -{ - Py_UCS4 c; - Py_ssize_t i = start; - int field_name_is_empty; - int using_numeric_index; - - /* find the part up until the first '.' or '[' */ - while (i < end) { - switch (c = PyUnicode_READ_CHAR(str, i++)) { - case '[': - case '.': - /* backup so that we this character is available to the - "rest" iterator */ - i--; - break; - default: - continue; - } - break; - } - - /* set up the return values */ - SubString_init(first, str, start, i); - FieldNameIterator_init(rest, str, i, end); - - /* see if "first" is an integer, in which case it's used as an index */ - *first_idx = get_integer(first); - if (*first_idx == -1 && PyErr_Occurred()) - return 0; - - field_name_is_empty = first->start >= first->end; - - /* If the field name is omitted or if we have a numeric index - specified, then we're doing numeric indexing into args. */ - using_numeric_index = field_name_is_empty || *first_idx != -1; - - /* We always get here exactly one time for each field we're - processing. And we get here in field order (counting by left - braces). So this is the perfect place to handle automatic field - numbering if the field name is omitted. */ - - /* Check if we need to do the auto-numbering. It's not needed if - we're called from string.Format routines, because it's handled - in that class by itself. */ - if (auto_number) { - /* Initialize our auto numbering state if this is the first - time we're either auto-numbering or manually numbering. */ - if (auto_number->an_state == ANS_INIT && using_numeric_index) - auto_number->an_state = field_name_is_empty ? - ANS_AUTO : ANS_MANUAL; - - /* Make sure our state is consistent with what we're doing - this time through. Only check if we're using a numeric - index. */ - if (using_numeric_index) - if (autonumber_state_error(auto_number->an_state, - field_name_is_empty)) - return 0; - /* Zero length field means we want to do auto-numbering of the - fields. */ - if (field_name_is_empty) - *first_idx = (auto_number->an_field_number)++; - } - - return 1; -} - - -/* - get_field_object returns the object inside {}, before the - format_spec. It handles getindex and getattr lookups and consumes - the entire input string. -*/ -static PyObject * -get_field_object(SubString *input, PyObject *args, PyObject *kwargs, - AutoNumber *auto_number) -{ - PyObject *obj = NULL; - int ok; - int is_attribute; - SubString name; - SubString first; - Py_ssize_t index; - FieldNameIterator rest; - - if (!field_name_split(input->str, input->start, input->end, &first, - &index, &rest, auto_number)) { - goto error; - } - - if (index == -1) { - /* look up in kwargs */ - PyObject *key = SubString_new_object(&first); - if (key == NULL) { - goto error; - } - if (kwargs == NULL) { - PyErr_SetObject(PyExc_KeyError, key); - Py_DECREF(key); - goto error; - } - /* Use PyObject_GetItem instead of PyDict_GetItem because this - code is no longer just used with kwargs. It might be passed - a non-dict when called through format_map. */ - obj = PyObject_GetItem(kwargs, key); - Py_DECREF(key); - if (obj == NULL) { - goto error; - } - } - else { - /* If args is NULL, we have a format string with a positional field - with only kwargs to retrieve it from. This can only happen when - used with format_map(), where positional arguments are not - allowed. */ - if (args == NULL) { - PyErr_SetString(PyExc_ValueError, "Format string contains " - "positional fields"); - goto error; - } - - /* look up in args */ - obj = PySequence_GetItem(args, index); +/* + unicode_format.h -- implementation of str.format(). +*/ + +/************************************************************************/ +/*********** Global data structures and forward declarations *********/ +/************************************************************************/ + +/* + A SubString consists of the characters between two string or + unicode pointers. +*/ +typedef struct { + PyObject *str; /* borrowed reference */ + Py_ssize_t start, end; +} SubString; + + +typedef enum { + ANS_INIT, + ANS_AUTO, + ANS_MANUAL +} AutoNumberState; /* Keep track if we're auto-numbering fields */ + +/* Keeps track of our auto-numbering state, and which number field we're on */ +typedef struct { + AutoNumberState an_state; + int an_field_number; +} AutoNumber; + + +/* forward declaration for recursion */ +static PyObject * +build_string(SubString *input, PyObject *args, PyObject *kwargs, + int recursion_depth, AutoNumber *auto_number); + + + +/************************************************************************/ +/************************** Utility functions ************************/ +/************************************************************************/ + +static void +AutoNumber_Init(AutoNumber *auto_number) +{ + auto_number->an_state = ANS_INIT; + auto_number->an_field_number = 0; +} + +/* fill in a SubString from a pointer and length */ +Py_LOCAL_INLINE(void) +SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end) +{ + str->str = s; + str->start = start; + str->end = end; +} + +/* return a new string. if str->str is NULL, return None */ +Py_LOCAL_INLINE(PyObject *) +SubString_new_object(SubString *str) +{ + if (str->str == NULL) + Py_RETURN_NONE; + return PyUnicode_Substring(str->str, str->start, str->end); +} + +/* return a new string. if str->str is NULL, return a new empty string */ +Py_LOCAL_INLINE(PyObject *) +SubString_new_object_or_empty(SubString *str) +{ + if (str->str == NULL) { + return PyUnicode_New(0, 0); + } + return SubString_new_object(str); +} + +/* Return 1 if an error has been detected switching between automatic + field numbering and manual field specification, else return 0. Set + ValueError on error. */ +static int +autonumber_state_error(AutoNumberState state, int field_name_is_empty) +{ + if (state == ANS_MANUAL) { + if (field_name_is_empty) { + PyErr_SetString(PyExc_ValueError, "cannot switch from " + "manual field specification to " + "automatic field numbering"); + return 1; + } + } + else { + if (!field_name_is_empty) { + PyErr_SetString(PyExc_ValueError, "cannot switch from " + "automatic field numbering to " + "manual field specification"); + return 1; + } + } + return 0; +} + + +/************************************************************************/ +/*********** Format string parsing -- integers and identifiers *********/ +/************************************************************************/ + +static Py_ssize_t +get_integer(const SubString *str) +{ + Py_ssize_t accumulator = 0; + Py_ssize_t digitval; + Py_ssize_t i; + + /* empty string is an error */ + if (str->start >= str->end) + return -1; + + for (i = str->start; i < str->end; i++) { + digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i)); + if (digitval < 0) + return -1; + /* + Detect possible overflow before it happens: + + accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if + accumulator > (PY_SSIZE_T_MAX - digitval) / 10. + */ + if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) { + PyErr_Format(PyExc_ValueError, + "Too many decimal digits in format string"); + return -1; + } + accumulator = accumulator * 10 + digitval; + } + return accumulator; +} + +/************************************************************************/ +/******** Functions to get field objects and specification strings ******/ +/************************************************************************/ + +/* do the equivalent of obj.name */ +static PyObject * +getattr(PyObject *obj, SubString *name) +{ + PyObject *newobj; + PyObject *str = SubString_new_object(name); + if (str == NULL) + return NULL; + newobj = PyObject_GetAttr(obj, str); + Py_DECREF(str); + return newobj; +} + +/* do the equivalent of obj[idx], where obj is a sequence */ +static PyObject * +getitem_sequence(PyObject *obj, Py_ssize_t idx) +{ + return PySequence_GetItem(obj, idx); +} + +/* do the equivalent of obj[idx], where obj is not a sequence */ +static PyObject * +getitem_idx(PyObject *obj, Py_ssize_t idx) +{ + PyObject *newobj; + PyObject *idx_obj = PyLong_FromSsize_t(idx); + if (idx_obj == NULL) + return NULL; + newobj = PyObject_GetItem(obj, idx_obj); + Py_DECREF(idx_obj); + return newobj; +} + +/* do the equivalent of obj[name] */ +static PyObject * +getitem_str(PyObject *obj, SubString *name) +{ + PyObject *newobj; + PyObject *str = SubString_new_object(name); + if (str == NULL) + return NULL; + newobj = PyObject_GetItem(obj, str); + Py_DECREF(str); + return newobj; +} + +typedef struct { + /* the entire string we're parsing. we assume that someone else + is managing its lifetime, and that it will exist for the + lifetime of the iterator. can be empty */ + SubString str; + + /* index to where we are inside field_name */ + Py_ssize_t index; +} FieldNameIterator; + + +static int +FieldNameIterator_init(FieldNameIterator *self, PyObject *s, + Py_ssize_t start, Py_ssize_t end) +{ + SubString_init(&self->str, s, start, end); + self->index = start; + return 1; +} + +static int +_FieldNameIterator_attr(FieldNameIterator *self, SubString *name) +{ + Py_UCS4 c; + + name->str = self->str.str; + name->start = self->index; + + /* return everything until '.' or '[' */ + while (self->index < self->str.end) { + c = PyUnicode_READ_CHAR(self->str.str, self->index++); + switch (c) { + case '[': + case '.': + /* backup so that we this character will be seen next time */ + self->index--; + break; + default: + continue; + } + break; + } + /* end of string is okay */ + name->end = self->index; + return 1; +} + +static int +_FieldNameIterator_item(FieldNameIterator *self, SubString *name) +{ + int bracket_seen = 0; + Py_UCS4 c; + + name->str = self->str.str; + name->start = self->index; + + /* return everything until ']' */ + while (self->index < self->str.end) { + c = PyUnicode_READ_CHAR(self->str.str, self->index++); + switch (c) { + case ']': + bracket_seen = 1; + break; + default: + continue; + } + break; + } + /* make sure we ended with a ']' */ + if (!bracket_seen) { + PyErr_SetString(PyExc_ValueError, "Missing ']' in format string"); + return 0; + } + + /* end of string is okay */ + /* don't include the ']' */ + name->end = self->index-1; + return 1; +} + +/* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */ +static int +FieldNameIterator_next(FieldNameIterator *self, int *is_attribute, + Py_ssize_t *name_idx, SubString *name) +{ + /* check at end of input */ + if (self->index >= self->str.end) + return 1; + + switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) { + case '.': + *is_attribute = 1; + if (_FieldNameIterator_attr(self, name) == 0) + return 0; + *name_idx = -1; + break; + case '[': + *is_attribute = 0; + if (_FieldNameIterator_item(self, name) == 0) + return 0; + *name_idx = get_integer(name); + if (*name_idx == -1 && PyErr_Occurred()) + return 0; + break; + default: + /* Invalid character follows ']' */ + PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may " + "follow ']' in format field specifier"); + return 0; + } + + /* empty string is an error */ + if (name->start == name->end) { + PyErr_SetString(PyExc_ValueError, "Empty attribute in format string"); + return 0; + } + + return 2; +} + + +/* input: field_name + output: 'first' points to the part before the first '[' or '.' + 'first_idx' is -1 if 'first' is not an integer, otherwise + it's the value of first converted to an integer + 'rest' is an iterator to return the rest +*/ +static int +field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first, + Py_ssize_t *first_idx, FieldNameIterator *rest, + AutoNumber *auto_number) +{ + Py_UCS4 c; + Py_ssize_t i = start; + int field_name_is_empty; + int using_numeric_index; + + /* find the part up until the first '.' or '[' */ + while (i < end) { + switch (c = PyUnicode_READ_CHAR(str, i++)) { + case '[': + case '.': + /* backup so that we this character is available to the + "rest" iterator */ + i--; + break; + default: + continue; + } + break; + } + + /* set up the return values */ + SubString_init(first, str, start, i); + FieldNameIterator_init(rest, str, i, end); + + /* see if "first" is an integer, in which case it's used as an index */ + *first_idx = get_integer(first); + if (*first_idx == -1 && PyErr_Occurred()) + return 0; + + field_name_is_empty = first->start >= first->end; + + /* If the field name is omitted or if we have a numeric index + specified, then we're doing numeric indexing into args. */ + using_numeric_index = field_name_is_empty || *first_idx != -1; + + /* We always get here exactly one time for each field we're + processing. And we get here in field order (counting by left + braces). So this is the perfect place to handle automatic field + numbering if the field name is omitted. */ + + /* Check if we need to do the auto-numbering. It's not needed if + we're called from string.Format routines, because it's handled + in that class by itself. */ + if (auto_number) { + /* Initialize our auto numbering state if this is the first + time we're either auto-numbering or manually numbering. */ + if (auto_number->an_state == ANS_INIT && using_numeric_index) + auto_number->an_state = field_name_is_empty ? + ANS_AUTO : ANS_MANUAL; + + /* Make sure our state is consistent with what we're doing + this time through. Only check if we're using a numeric + index. */ + if (using_numeric_index) + if (autonumber_state_error(auto_number->an_state, + field_name_is_empty)) + return 0; + /* Zero length field means we want to do auto-numbering of the + fields. */ + if (field_name_is_empty) + *first_idx = (auto_number->an_field_number)++; + } + + return 1; +} + + +/* + get_field_object returns the object inside {}, before the + format_spec. It handles getindex and getattr lookups and consumes + the entire input string. +*/ +static PyObject * +get_field_object(SubString *input, PyObject *args, PyObject *kwargs, + AutoNumber *auto_number) +{ + PyObject *obj = NULL; + int ok; + int is_attribute; + SubString name; + SubString first; + Py_ssize_t index; + FieldNameIterator rest; + + if (!field_name_split(input->str, input->start, input->end, &first, + &index, &rest, auto_number)) { + goto error; + } + + if (index == -1) { + /* look up in kwargs */ + PyObject *key = SubString_new_object(&first); + if (key == NULL) { + goto error; + } + if (kwargs == NULL) { + PyErr_SetObject(PyExc_KeyError, key); + Py_DECREF(key); + goto error; + } + /* Use PyObject_GetItem instead of PyDict_GetItem because this + code is no longer just used with kwargs. It might be passed + a non-dict when called through format_map. */ + obj = PyObject_GetItem(kwargs, key); + Py_DECREF(key); + if (obj == NULL) { + goto error; + } + } + else { + /* If args is NULL, we have a format string with a positional field + with only kwargs to retrieve it from. This can only happen when + used with format_map(), where positional arguments are not + allowed. */ + if (args == NULL) { + PyErr_SetString(PyExc_ValueError, "Format string contains " + "positional fields"); + goto error; + } + + /* look up in args */ + obj = PySequence_GetItem(args, index); if (obj == NULL) { PyErr_Format(PyExc_IndexError, "Replacement index %zd out of range for positional " @@ -447,845 +447,845 @@ get_field_object(SubString *input, PyObject *args, PyObject *kwargs, index); goto error; } - } - - /* iterate over the rest of the field_name */ - while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index, - &name)) == 2) { - PyObject *tmp; - - if (is_attribute) - /* getattr lookup "." */ - tmp = getattr(obj, &name); - else - /* getitem lookup "[]" */ - if (index == -1) - tmp = getitem_str(obj, &name); - else - if (PySequence_Check(obj)) - tmp = getitem_sequence(obj, index); - else - /* not a sequence */ - tmp = getitem_idx(obj, index); - if (tmp == NULL) - goto error; - - /* assign to obj */ - Py_DECREF(obj); - obj = tmp; - } - /* end of iterator, this is the non-error case */ - if (ok == 1) - return obj; -error: - Py_XDECREF(obj); - return NULL; -} - -/************************************************************************/ -/***************** Field rendering functions **************************/ -/************************************************************************/ - -/* - render_field() is the main function in this section. It takes the - field object and field specification string generated by - get_field_and_spec, and renders the field into the output string. - - render_field calls fieldobj.__format__(format_spec) method, and - appends to the output. -*/ -static int -render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer) -{ - int ok = 0; - PyObject *result = NULL; - PyObject *format_spec_object = NULL; - int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL; - int err; - - /* If we know the type exactly, skip the lookup of __format__ and just - call the formatter directly. */ - if (PyUnicode_CheckExact(fieldobj)) - formatter = _PyUnicode_FormatAdvancedWriter; - else if (PyLong_CheckExact(fieldobj)) - formatter = _PyLong_FormatAdvancedWriter; - else if (PyFloat_CheckExact(fieldobj)) - formatter = _PyFloat_FormatAdvancedWriter; - else if (PyComplex_CheckExact(fieldobj)) - formatter = _PyComplex_FormatAdvancedWriter; - - if (formatter) { - /* we know exactly which formatter will be called when __format__ is - looked up, so call it directly, instead. */ - err = formatter(writer, fieldobj, format_spec->str, - format_spec->start, format_spec->end); - return (err == 0); - } - else { - /* We need to create an object out of the pointers we have, because - __format__ takes a string/unicode object for format_spec. */ - if (format_spec->str) - format_spec_object = PyUnicode_Substring(format_spec->str, - format_spec->start, - format_spec->end); - else - format_spec_object = PyUnicode_New(0, 0); - if (format_spec_object == NULL) - goto done; - - result = PyObject_Format(fieldobj, format_spec_object); - } - if (result == NULL) - goto done; - - if (_PyUnicodeWriter_WriteStr(writer, result) == -1) - goto done; - ok = 1; - -done: - Py_XDECREF(format_spec_object); - Py_XDECREF(result); - return ok; -} - -static int -parse_field(SubString *str, SubString *field_name, SubString *format_spec, - int *format_spec_needs_expanding, Py_UCS4 *conversion) -{ - /* Note this function works if the field name is zero length, - which is good. Zero length field names are handled later, in - field_name_split. */ - - Py_UCS4 c = 0; - - /* initialize these, as they may be empty */ - *conversion = '\0'; - SubString_init(format_spec, NULL, 0, 0); - - /* Search for the field name. it's terminated by the end of - the string, or a ':' or '!' */ - field_name->str = str->str; - field_name->start = str->start; - while (str->start < str->end) { - switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) { - case '{': - PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name"); - return 0; - case '[': - for (; str->start < str->end; str->start++) - if (PyUnicode_READ_CHAR(str->str, str->start) == ']') - break; - continue; - case '}': - case ':': - case '!': - break; - default: - continue; - } - break; - } - - field_name->end = str->start - 1; - if (c == '!' || c == ':') { - Py_ssize_t count; - /* we have a format specifier and/or a conversion */ - /* don't include the last character */ - - /* see if there's a conversion specifier */ - if (c == '!') { - /* there must be another character present */ - if (str->start >= str->end) { - PyErr_SetString(PyExc_ValueError, - "end of string while looking for conversion " - "specifier"); - return 0; - } - *conversion = PyUnicode_READ_CHAR(str->str, str->start++); - - if (str->start < str->end) { - c = PyUnicode_READ_CHAR(str->str, str->start++); - if (c == '}') - return 1; - if (c != ':') { - PyErr_SetString(PyExc_ValueError, - "expected ':' after conversion specifier"); - return 0; - } - } - } - format_spec->str = str->str; - format_spec->start = str->start; - count = 1; - while (str->start < str->end) { - switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) { - case '{': - *format_spec_needs_expanding = 1; - count++; - break; - case '}': - count--; - if (count == 0) { - format_spec->end = str->start - 1; - return 1; - } - break; - default: - break; - } - } - - PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec"); - return 0; - } - else if (c != '}') { - PyErr_SetString(PyExc_ValueError, "expected '}' before end of string"); - return 0; - } - - return 1; -} - -/************************************************************************/ -/******* Output string allocation and escape-to-markup processing ******/ -/************************************************************************/ - -/* MarkupIterator breaks the string into pieces of either literal - text, or things inside {} that need to be marked up. it is - designed to make it easy to wrap a Python iterator around it, for - use with the Formatter class */ - -typedef struct { - SubString str; -} MarkupIterator; - -static int -MarkupIterator_init(MarkupIterator *self, PyObject *str, - Py_ssize_t start, Py_ssize_t end) -{ - SubString_init(&self->str, str, start, end); - return 1; -} - -/* returns 0 on error, 1 on non-error termination, and 2 if it got a - string (or something to be expanded) */ -static int -MarkupIterator_next(MarkupIterator *self, SubString *literal, - int *field_present, SubString *field_name, - SubString *format_spec, Py_UCS4 *conversion, - int *format_spec_needs_expanding) -{ - int at_end; - Py_UCS4 c = 0; - Py_ssize_t start; - Py_ssize_t len; - int markup_follows = 0; - - /* initialize all of the output variables */ - SubString_init(literal, NULL, 0, 0); - SubString_init(field_name, NULL, 0, 0); - SubString_init(format_spec, NULL, 0, 0); - *conversion = '\0'; - *format_spec_needs_expanding = 0; - *field_present = 0; - - /* No more input, end of iterator. This is the normal exit - path. */ - if (self->str.start >= self->str.end) - return 1; - - start = self->str.start; - - /* First read any literal text. Read until the end of string, an - escaped '{' or '}', or an unescaped '{'. In order to never - allocate memory and so I can just pass pointers around, if - there's an escaped '{' or '}' then we'll return the literal - including the brace, but no format object. The next time - through, we'll return the rest of the literal, skipping past - the second consecutive brace. */ - while (self->str.start < self->str.end) { - switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) { - case '{': - case '}': - markup_follows = 1; - break; - default: - continue; - } - break; - } - - at_end = self->str.start >= self->str.end; - len = self->str.start - start; - - if ((c == '}') && (at_end || - (c != PyUnicode_READ_CHAR(self->str.str, - self->str.start)))) { - PyErr_SetString(PyExc_ValueError, "Single '}' encountered " - "in format string"); - return 0; - } - if (at_end && c == '{') { - PyErr_SetString(PyExc_ValueError, "Single '{' encountered " - "in format string"); - return 0; - } - if (!at_end) { - if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) { - /* escaped } or {, skip it in the input. there is no - markup object following us, just this literal text */ - self->str.start++; - markup_follows = 0; - } - else - len--; - } - - /* record the literal text */ - literal->str = self->str.str; - literal->start = start; - literal->end = start + len; - - if (!markup_follows) - return 2; - - /* this is markup; parse the field */ - *field_present = 1; - if (!parse_field(&self->str, field_name, format_spec, - format_spec_needs_expanding, conversion)) - return 0; - return 2; -} - - -/* do the !r or !s conversion on obj */ -static PyObject * -do_conversion(PyObject *obj, Py_UCS4 conversion) -{ - /* XXX in pre-3.0, do we need to convert this to unicode, since it - might have returned a string? */ - switch (conversion) { - case 'r': - return PyObject_Repr(obj); - case 's': - return PyObject_Str(obj); - case 'a': - return PyObject_ASCII(obj); - default: - if (conversion > 32 && conversion < 127) { - /* It's the ASCII subrange; casting to char is safe - (assuming the execution character set is an ASCII - superset). */ - PyErr_Format(PyExc_ValueError, - "Unknown conversion specifier %c", - (char)conversion); - } else - PyErr_Format(PyExc_ValueError, - "Unknown conversion specifier \\x%x", - (unsigned int)conversion); - return NULL; - } -} - -/* given: - - {field_name!conversion:format_spec} - - compute the result and write it to output. - format_spec_needs_expanding is an optimization. if it's false, - just output the string directly, otherwise recursively expand the - format_spec string. - - field_name is allowed to be zero length, in which case we - are doing auto field numbering. -*/ - -static int -output_markup(SubString *field_name, SubString *format_spec, - int format_spec_needs_expanding, Py_UCS4 conversion, - _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs, - int recursion_depth, AutoNumber *auto_number) -{ - PyObject *tmp = NULL; - PyObject *fieldobj = NULL; - SubString expanded_format_spec; - SubString *actual_format_spec; - int result = 0; - - /* convert field_name to an object */ - fieldobj = get_field_object(field_name, args, kwargs, auto_number); - if (fieldobj == NULL) - goto done; - - if (conversion != '\0') { - tmp = do_conversion(fieldobj, conversion); - if (tmp == NULL || PyUnicode_READY(tmp) == -1) - goto done; - - /* do the assignment, transferring ownership: fieldobj = tmp */ - Py_DECREF(fieldobj); - fieldobj = tmp; - tmp = NULL; - } - + } + + /* iterate over the rest of the field_name */ + while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index, + &name)) == 2) { + PyObject *tmp; + + if (is_attribute) + /* getattr lookup "." */ + tmp = getattr(obj, &name); + else + /* getitem lookup "[]" */ + if (index == -1) + tmp = getitem_str(obj, &name); + else + if (PySequence_Check(obj)) + tmp = getitem_sequence(obj, index); + else + /* not a sequence */ + tmp = getitem_idx(obj, index); + if (tmp == NULL) + goto error; + + /* assign to obj */ + Py_DECREF(obj); + obj = tmp; + } + /* end of iterator, this is the non-error case */ + if (ok == 1) + return obj; +error: + Py_XDECREF(obj); + return NULL; +} + +/************************************************************************/ +/***************** Field rendering functions **************************/ +/************************************************************************/ + +/* + render_field() is the main function in this section. It takes the + field object and field specification string generated by + get_field_and_spec, and renders the field into the output string. + + render_field calls fieldobj.__format__(format_spec) method, and + appends to the output. +*/ +static int +render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer) +{ + int ok = 0; + PyObject *result = NULL; + PyObject *format_spec_object = NULL; + int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL; + int err; + + /* If we know the type exactly, skip the lookup of __format__ and just + call the formatter directly. */ + if (PyUnicode_CheckExact(fieldobj)) + formatter = _PyUnicode_FormatAdvancedWriter; + else if (PyLong_CheckExact(fieldobj)) + formatter = _PyLong_FormatAdvancedWriter; + else if (PyFloat_CheckExact(fieldobj)) + formatter = _PyFloat_FormatAdvancedWriter; + else if (PyComplex_CheckExact(fieldobj)) + formatter = _PyComplex_FormatAdvancedWriter; + + if (formatter) { + /* we know exactly which formatter will be called when __format__ is + looked up, so call it directly, instead. */ + err = formatter(writer, fieldobj, format_spec->str, + format_spec->start, format_spec->end); + return (err == 0); + } + else { + /* We need to create an object out of the pointers we have, because + __format__ takes a string/unicode object for format_spec. */ + if (format_spec->str) + format_spec_object = PyUnicode_Substring(format_spec->str, + format_spec->start, + format_spec->end); + else + format_spec_object = PyUnicode_New(0, 0); + if (format_spec_object == NULL) + goto done; + + result = PyObject_Format(fieldobj, format_spec_object); + } + if (result == NULL) + goto done; + + if (_PyUnicodeWriter_WriteStr(writer, result) == -1) + goto done; + ok = 1; + +done: + Py_XDECREF(format_spec_object); + Py_XDECREF(result); + return ok; +} + +static int +parse_field(SubString *str, SubString *field_name, SubString *format_spec, + int *format_spec_needs_expanding, Py_UCS4 *conversion) +{ + /* Note this function works if the field name is zero length, + which is good. Zero length field names are handled later, in + field_name_split. */ + + Py_UCS4 c = 0; + + /* initialize these, as they may be empty */ + *conversion = '\0'; + SubString_init(format_spec, NULL, 0, 0); + + /* Search for the field name. it's terminated by the end of + the string, or a ':' or '!' */ + field_name->str = str->str; + field_name->start = str->start; + while (str->start < str->end) { + switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) { + case '{': + PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name"); + return 0; + case '[': + for (; str->start < str->end; str->start++) + if (PyUnicode_READ_CHAR(str->str, str->start) == ']') + break; + continue; + case '}': + case ':': + case '!': + break; + default: + continue; + } + break; + } + + field_name->end = str->start - 1; + if (c == '!' || c == ':') { + Py_ssize_t count; + /* we have a format specifier and/or a conversion */ + /* don't include the last character */ + + /* see if there's a conversion specifier */ + if (c == '!') { + /* there must be another character present */ + if (str->start >= str->end) { + PyErr_SetString(PyExc_ValueError, + "end of string while looking for conversion " + "specifier"); + return 0; + } + *conversion = PyUnicode_READ_CHAR(str->str, str->start++); + + if (str->start < str->end) { + c = PyUnicode_READ_CHAR(str->str, str->start++); + if (c == '}') + return 1; + if (c != ':') { + PyErr_SetString(PyExc_ValueError, + "expected ':' after conversion specifier"); + return 0; + } + } + } + format_spec->str = str->str; + format_spec->start = str->start; + count = 1; + while (str->start < str->end) { + switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) { + case '{': + *format_spec_needs_expanding = 1; + count++; + break; + case '}': + count--; + if (count == 0) { + format_spec->end = str->start - 1; + return 1; + } + break; + default: + break; + } + } + + PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec"); + return 0; + } + else if (c != '}') { + PyErr_SetString(PyExc_ValueError, "expected '}' before end of string"); + return 0; + } + + return 1; +} + +/************************************************************************/ +/******* Output string allocation and escape-to-markup processing ******/ +/************************************************************************/ + +/* MarkupIterator breaks the string into pieces of either literal + text, or things inside {} that need to be marked up. it is + designed to make it easy to wrap a Python iterator around it, for + use with the Formatter class */ + +typedef struct { + SubString str; +} MarkupIterator; + +static int +MarkupIterator_init(MarkupIterator *self, PyObject *str, + Py_ssize_t start, Py_ssize_t end) +{ + SubString_init(&self->str, str, start, end); + return 1; +} + +/* returns 0 on error, 1 on non-error termination, and 2 if it got a + string (or something to be expanded) */ +static int +MarkupIterator_next(MarkupIterator *self, SubString *literal, + int *field_present, SubString *field_name, + SubString *format_spec, Py_UCS4 *conversion, + int *format_spec_needs_expanding) +{ + int at_end; + Py_UCS4 c = 0; + Py_ssize_t start; + Py_ssize_t len; + int markup_follows = 0; + + /* initialize all of the output variables */ + SubString_init(literal, NULL, 0, 0); + SubString_init(field_name, NULL, 0, 0); + SubString_init(format_spec, NULL, 0, 0); + *conversion = '\0'; + *format_spec_needs_expanding = 0; + *field_present = 0; + + /* No more input, end of iterator. This is the normal exit + path. */ + if (self->str.start >= self->str.end) + return 1; + + start = self->str.start; + + /* First read any literal text. Read until the end of string, an + escaped '{' or '}', or an unescaped '{'. In order to never + allocate memory and so I can just pass pointers around, if + there's an escaped '{' or '}' then we'll return the literal + including the brace, but no format object. The next time + through, we'll return the rest of the literal, skipping past + the second consecutive brace. */ + while (self->str.start < self->str.end) { + switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) { + case '{': + case '}': + markup_follows = 1; + break; + default: + continue; + } + break; + } + + at_end = self->str.start >= self->str.end; + len = self->str.start - start; + + if ((c == '}') && (at_end || + (c != PyUnicode_READ_CHAR(self->str.str, + self->str.start)))) { + PyErr_SetString(PyExc_ValueError, "Single '}' encountered " + "in format string"); + return 0; + } + if (at_end && c == '{') { + PyErr_SetString(PyExc_ValueError, "Single '{' encountered " + "in format string"); + return 0; + } + if (!at_end) { + if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) { + /* escaped } or {, skip it in the input. there is no + markup object following us, just this literal text */ + self->str.start++; + markup_follows = 0; + } + else + len--; + } + + /* record the literal text */ + literal->str = self->str.str; + literal->start = start; + literal->end = start + len; + + if (!markup_follows) + return 2; + + /* this is markup; parse the field */ + *field_present = 1; + if (!parse_field(&self->str, field_name, format_spec, + format_spec_needs_expanding, conversion)) + return 0; + return 2; +} + + +/* do the !r or !s conversion on obj */ +static PyObject * +do_conversion(PyObject *obj, Py_UCS4 conversion) +{ + /* XXX in pre-3.0, do we need to convert this to unicode, since it + might have returned a string? */ + switch (conversion) { + case 'r': + return PyObject_Repr(obj); + case 's': + return PyObject_Str(obj); + case 'a': + return PyObject_ASCII(obj); + default: + if (conversion > 32 && conversion < 127) { + /* It's the ASCII subrange; casting to char is safe + (assuming the execution character set is an ASCII + superset). */ + PyErr_Format(PyExc_ValueError, + "Unknown conversion specifier %c", + (char)conversion); + } else + PyErr_Format(PyExc_ValueError, + "Unknown conversion specifier \\x%x", + (unsigned int)conversion); + return NULL; + } +} + +/* given: + + {field_name!conversion:format_spec} + + compute the result and write it to output. + format_spec_needs_expanding is an optimization. if it's false, + just output the string directly, otherwise recursively expand the + format_spec string. + + field_name is allowed to be zero length, in which case we + are doing auto field numbering. +*/ + +static int +output_markup(SubString *field_name, SubString *format_spec, + int format_spec_needs_expanding, Py_UCS4 conversion, + _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs, + int recursion_depth, AutoNumber *auto_number) +{ + PyObject *tmp = NULL; + PyObject *fieldobj = NULL; + SubString expanded_format_spec; + SubString *actual_format_spec; + int result = 0; + + /* convert field_name to an object */ + fieldobj = get_field_object(field_name, args, kwargs, auto_number); + if (fieldobj == NULL) + goto done; + + if (conversion != '\0') { + tmp = do_conversion(fieldobj, conversion); + if (tmp == NULL || PyUnicode_READY(tmp) == -1) + goto done; + + /* do the assignment, transferring ownership: fieldobj = tmp */ + Py_DECREF(fieldobj); + fieldobj = tmp; + tmp = NULL; + } + /* if needed, recursively compute the format_spec */ - if (format_spec_needs_expanding) { - tmp = build_string(format_spec, args, kwargs, recursion_depth-1, - auto_number); - if (tmp == NULL || PyUnicode_READY(tmp) == -1) - goto done; - - /* note that in the case we're expanding the format string, - tmp must be kept around until after the call to - render_field. */ - SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp)); - actual_format_spec = &expanded_format_spec; - } - else - actual_format_spec = format_spec; - - if (render_field(fieldobj, actual_format_spec, writer) == 0) - goto done; - - result = 1; - -done: - Py_XDECREF(fieldobj); - Py_XDECREF(tmp); - - return result; -} - -/* - do_markup is the top-level loop for the format() method. It - searches through the format string for escapes to markup codes, and - calls other functions to move non-markup text to the output, - and to perform the markup to the output. -*/ -static int -do_markup(SubString *input, PyObject *args, PyObject *kwargs, - _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number) -{ - MarkupIterator iter; - int format_spec_needs_expanding; - int result; - int field_present; - SubString literal; - SubString field_name; - SubString format_spec; - Py_UCS4 conversion; - - MarkupIterator_init(&iter, input->str, input->start, input->end); - while ((result = MarkupIterator_next(&iter, &literal, &field_present, - &field_name, &format_spec, - &conversion, - &format_spec_needs_expanding)) == 2) { - if (literal.end != literal.start) { - if (!field_present && iter.str.start == iter.str.end) - writer->overallocate = 0; - if (_PyUnicodeWriter_WriteSubstring(writer, literal.str, - literal.start, literal.end) < 0) - return 0; - } - - if (field_present) { - if (iter.str.start == iter.str.end) - writer->overallocate = 0; - if (!output_markup(&field_name, &format_spec, - format_spec_needs_expanding, conversion, writer, - args, kwargs, recursion_depth, auto_number)) - return 0; - } - } - return result; -} - - -/* - build_string allocates the output string and then - calls do_markup to do the heavy lifting. -*/ -static PyObject * -build_string(SubString *input, PyObject *args, PyObject *kwargs, - int recursion_depth, AutoNumber *auto_number) -{ - _PyUnicodeWriter writer; - - /* check the recursion level */ - if (recursion_depth <= 0) { - PyErr_SetString(PyExc_ValueError, - "Max string recursion exceeded"); - return NULL; - } - - _PyUnicodeWriter_Init(&writer); - writer.overallocate = 1; - writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100; - - if (!do_markup(input, args, kwargs, &writer, recursion_depth, - auto_number)) { - _PyUnicodeWriter_Dealloc(&writer); - return NULL; - } - - return _PyUnicodeWriter_Finish(&writer); -} - -/************************************************************************/ -/*********** main routine ***********************************************/ -/************************************************************************/ - -/* this is the main entry point */ -static PyObject * -do_string_format(PyObject *self, PyObject *args, PyObject *kwargs) -{ - SubString input; - - /* PEP 3101 says only 2 levels, so that - "{0:{1}}".format('abc', 's') # works - "{0:{1:{2}}}".format('abc', 's', '') # fails - */ - int recursion_depth = 2; - - AutoNumber auto_number; - - if (PyUnicode_READY(self) == -1) - return NULL; - - AutoNumber_Init(&auto_number); - SubString_init(&input, self, 0, PyUnicode_GET_LENGTH(self)); - return build_string(&input, args, kwargs, recursion_depth, &auto_number); -} - -static PyObject * -do_string_format_map(PyObject *self, PyObject *obj) -{ - return do_string_format(self, NULL, obj); -} - - -/************************************************************************/ -/*********** formatteriterator ******************************************/ -/************************************************************************/ - -/* This is used to implement string.Formatter.vparse(). It exists so - Formatter can share code with the built in unicode.format() method. - It's really just a wrapper around MarkupIterator that is callable - from Python. */ - -typedef struct { - PyObject_HEAD - PyObject *str; - MarkupIterator it_markup; -} formatteriterobject; - -static void -formatteriter_dealloc(formatteriterobject *it) -{ - Py_XDECREF(it->str); - PyObject_FREE(it); -} - -/* returns a tuple: - (literal, field_name, format_spec, conversion) - - literal is any literal text to output. might be zero length - field_name is the string before the ':'. might be None - format_spec is the string after the ':'. mibht be None - conversion is either None, or the string after the '!' -*/ -static PyObject * -formatteriter_next(formatteriterobject *it) -{ - SubString literal; - SubString field_name; - SubString format_spec; - Py_UCS4 conversion; - int format_spec_needs_expanding; - int field_present; - int result = MarkupIterator_next(&it->it_markup, &literal, &field_present, - &field_name, &format_spec, &conversion, - &format_spec_needs_expanding); - - /* all of the SubString objects point into it->str, so no - memory management needs to be done on them */ - assert(0 <= result && result <= 2); - if (result == 0 || result == 1) - /* if 0, error has already been set, if 1, iterator is empty */ - return NULL; - else { - PyObject *literal_str = NULL; - PyObject *field_name_str = NULL; - PyObject *format_spec_str = NULL; - PyObject *conversion_str = NULL; - PyObject *tuple = NULL; - - literal_str = SubString_new_object(&literal); - if (literal_str == NULL) - goto done; - - field_name_str = SubString_new_object(&field_name); - if (field_name_str == NULL) - goto done; - - /* if field_name is non-zero length, return a string for - format_spec (even if zero length), else return None */ - format_spec_str = (field_present ? - SubString_new_object_or_empty : - SubString_new_object)(&format_spec); - if (format_spec_str == NULL) - goto done; - - /* if the conversion is not specified, return a None, - otherwise create a one length string with the conversion - character */ - if (conversion == '\0') { - conversion_str = Py_None; - Py_INCREF(conversion_str); - } - else - conversion_str = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, - &conversion, 1); - if (conversion_str == NULL) - goto done; - - tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str, - conversion_str); - done: - Py_XDECREF(literal_str); - Py_XDECREF(field_name_str); - Py_XDECREF(format_spec_str); - Py_XDECREF(conversion_str); - return tuple; - } -} - -static PyMethodDef formatteriter_methods[] = { - {NULL, NULL} /* sentinel */ -}; - -static PyTypeObject PyFormatterIter_Type = { - PyVarObject_HEAD_INIT(&PyType_Type, 0) - "formatteriterator", /* tp_name */ - sizeof(formatteriterobject), /* tp_basicsize */ - 0, /* tp_itemsize */ - /* methods */ - (destructor)formatteriter_dealloc, /* tp_dealloc */ + if (format_spec_needs_expanding) { + tmp = build_string(format_spec, args, kwargs, recursion_depth-1, + auto_number); + if (tmp == NULL || PyUnicode_READY(tmp) == -1) + goto done; + + /* note that in the case we're expanding the format string, + tmp must be kept around until after the call to + render_field. */ + SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp)); + actual_format_spec = &expanded_format_spec; + } + else + actual_format_spec = format_spec; + + if (render_field(fieldobj, actual_format_spec, writer) == 0) + goto done; + + result = 1; + +done: + Py_XDECREF(fieldobj); + Py_XDECREF(tmp); + + return result; +} + +/* + do_markup is the top-level loop for the format() method. It + searches through the format string for escapes to markup codes, and + calls other functions to move non-markup text to the output, + and to perform the markup to the output. +*/ +static int +do_markup(SubString *input, PyObject *args, PyObject *kwargs, + _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number) +{ + MarkupIterator iter; + int format_spec_needs_expanding; + int result; + int field_present; + SubString literal; + SubString field_name; + SubString format_spec; + Py_UCS4 conversion; + + MarkupIterator_init(&iter, input->str, input->start, input->end); + while ((result = MarkupIterator_next(&iter, &literal, &field_present, + &field_name, &format_spec, + &conversion, + &format_spec_needs_expanding)) == 2) { + if (literal.end != literal.start) { + if (!field_present && iter.str.start == iter.str.end) + writer->overallocate = 0; + if (_PyUnicodeWriter_WriteSubstring(writer, literal.str, + literal.start, literal.end) < 0) + return 0; + } + + if (field_present) { + if (iter.str.start == iter.str.end) + writer->overallocate = 0; + if (!output_markup(&field_name, &format_spec, + format_spec_needs_expanding, conversion, writer, + args, kwargs, recursion_depth, auto_number)) + return 0; + } + } + return result; +} + + +/* + build_string allocates the output string and then + calls do_markup to do the heavy lifting. +*/ +static PyObject * +build_string(SubString *input, PyObject *args, PyObject *kwargs, + int recursion_depth, AutoNumber *auto_number) +{ + _PyUnicodeWriter writer; + + /* check the recursion level */ + if (recursion_depth <= 0) { + PyErr_SetString(PyExc_ValueError, + "Max string recursion exceeded"); + return NULL; + } + + _PyUnicodeWriter_Init(&writer); + writer.overallocate = 1; + writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100; + + if (!do_markup(input, args, kwargs, &writer, recursion_depth, + auto_number)) { + _PyUnicodeWriter_Dealloc(&writer); + return NULL; + } + + return _PyUnicodeWriter_Finish(&writer); +} + +/************************************************************************/ +/*********** main routine ***********************************************/ +/************************************************************************/ + +/* this is the main entry point */ +static PyObject * +do_string_format(PyObject *self, PyObject *args, PyObject *kwargs) +{ + SubString input; + + /* PEP 3101 says only 2 levels, so that + "{0:{1}}".format('abc', 's') # works + "{0:{1:{2}}}".format('abc', 's', '') # fails + */ + int recursion_depth = 2; + + AutoNumber auto_number; + + if (PyUnicode_READY(self) == -1) + return NULL; + + AutoNumber_Init(&auto_number); + SubString_init(&input, self, 0, PyUnicode_GET_LENGTH(self)); + return build_string(&input, args, kwargs, recursion_depth, &auto_number); +} + +static PyObject * +do_string_format_map(PyObject *self, PyObject *obj) +{ + return do_string_format(self, NULL, obj); +} + + +/************************************************************************/ +/*********** formatteriterator ******************************************/ +/************************************************************************/ + +/* This is used to implement string.Formatter.vparse(). It exists so + Formatter can share code with the built in unicode.format() method. + It's really just a wrapper around MarkupIterator that is callable + from Python. */ + +typedef struct { + PyObject_HEAD + PyObject *str; + MarkupIterator it_markup; +} formatteriterobject; + +static void +formatteriter_dealloc(formatteriterobject *it) +{ + Py_XDECREF(it->str); + PyObject_FREE(it); +} + +/* returns a tuple: + (literal, field_name, format_spec, conversion) + + literal is any literal text to output. might be zero length + field_name is the string before the ':'. might be None + format_spec is the string after the ':'. mibht be None + conversion is either None, or the string after the '!' +*/ +static PyObject * +formatteriter_next(formatteriterobject *it) +{ + SubString literal; + SubString field_name; + SubString format_spec; + Py_UCS4 conversion; + int format_spec_needs_expanding; + int field_present; + int result = MarkupIterator_next(&it->it_markup, &literal, &field_present, + &field_name, &format_spec, &conversion, + &format_spec_needs_expanding); + + /* all of the SubString objects point into it->str, so no + memory management needs to be done on them */ + assert(0 <= result && result <= 2); + if (result == 0 || result == 1) + /* if 0, error has already been set, if 1, iterator is empty */ + return NULL; + else { + PyObject *literal_str = NULL; + PyObject *field_name_str = NULL; + PyObject *format_spec_str = NULL; + PyObject *conversion_str = NULL; + PyObject *tuple = NULL; + + literal_str = SubString_new_object(&literal); + if (literal_str == NULL) + goto done; + + field_name_str = SubString_new_object(&field_name); + if (field_name_str == NULL) + goto done; + + /* if field_name is non-zero length, return a string for + format_spec (even if zero length), else return None */ + format_spec_str = (field_present ? + SubString_new_object_or_empty : + SubString_new_object)(&format_spec); + if (format_spec_str == NULL) + goto done; + + /* if the conversion is not specified, return a None, + otherwise create a one length string with the conversion + character */ + if (conversion == '\0') { + conversion_str = Py_None; + Py_INCREF(conversion_str); + } + else + conversion_str = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, + &conversion, 1); + if (conversion_str == NULL) + goto done; + + tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str, + conversion_str); + done: + Py_XDECREF(literal_str); + Py_XDECREF(field_name_str); + Py_XDECREF(format_spec_str); + Py_XDECREF(conversion_str); + return tuple; + } +} + +static PyMethodDef formatteriter_methods[] = { + {NULL, NULL} /* sentinel */ +}; + +static PyTypeObject PyFormatterIter_Type = { + PyVarObject_HEAD_INIT(&PyType_Type, 0) + "formatteriterator", /* tp_name */ + sizeof(formatteriterobject), /* tp_basicsize */ + 0, /* tp_itemsize */ + /* methods */ + (destructor)formatteriter_dealloc, /* tp_dealloc */ 0, /* tp_vectorcall_offset */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ 0, /* tp_as_async */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - PyObject_GenericGetAttr, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - 0, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - PyObject_SelfIter, /* tp_iter */ - (iternextfunc)formatteriter_next, /* tp_iternext */ - formatteriter_methods, /* tp_methods */ - 0, -}; - -/* unicode_formatter_parser is used to implement - string.Formatter.vformat. it parses a string and returns tuples - describing the parsed elements. It's a wrapper around - stringlib/string_format.h's MarkupIterator */ -static PyObject * -formatter_parser(PyObject *ignored, PyObject *self) -{ - formatteriterobject *it; - - if (!PyUnicode_Check(self)) { - PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name); - return NULL; - } - - if (PyUnicode_READY(self) == -1) - return NULL; - - it = PyObject_New(formatteriterobject, &PyFormatterIter_Type); - if (it == NULL) - return NULL; - - /* take ownership, give the object to the iterator */ - Py_INCREF(self); - it->str = self; - - /* initialize the contained MarkupIterator */ - MarkupIterator_init(&it->it_markup, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self)); - return (PyObject *)it; -} - - -/************************************************************************/ -/*********** fieldnameiterator ******************************************/ -/************************************************************************/ - - -/* This is used to implement string.Formatter.vparse(). It parses the - field name into attribute and item values. It's a Python-callable - wrapper around FieldNameIterator */ - -typedef struct { - PyObject_HEAD - PyObject *str; - FieldNameIterator it_field; -} fieldnameiterobject; - -static void -fieldnameiter_dealloc(fieldnameiterobject *it) -{ - Py_XDECREF(it->str); - PyObject_FREE(it); -} - -/* returns a tuple: - (is_attr, value) - is_attr is true if we used attribute syntax (e.g., '.foo') - false if we used index syntax (e.g., '[foo]') - value is an integer or string -*/ -static PyObject * -fieldnameiter_next(fieldnameiterobject *it) -{ - int result; - int is_attr; - Py_ssize_t idx; - SubString name; - - result = FieldNameIterator_next(&it->it_field, &is_attr, - &idx, &name); - if (result == 0 || result == 1) - /* if 0, error has already been set, if 1, iterator is empty */ - return NULL; - else { - PyObject* result = NULL; - PyObject* is_attr_obj = NULL; - PyObject* obj = NULL; - - is_attr_obj = PyBool_FromLong(is_attr); - if (is_attr_obj == NULL) - goto done; - - /* either an integer or a string */ - if (idx != -1) - obj = PyLong_FromSsize_t(idx); - else - obj = SubString_new_object(&name); - if (obj == NULL) - goto done; - - /* return a tuple of values */ - result = PyTuple_Pack(2, is_attr_obj, obj); - - done: - Py_XDECREF(is_attr_obj); - Py_XDECREF(obj); - return result; - } -} - -static PyMethodDef fieldnameiter_methods[] = { - {NULL, NULL} /* sentinel */ -}; - -static PyTypeObject PyFieldNameIter_Type = { - PyVarObject_HEAD_INIT(&PyType_Type, 0) - "fieldnameiterator", /* tp_name */ - sizeof(fieldnameiterobject), /* tp_basicsize */ - 0, /* tp_itemsize */ - /* methods */ - (destructor)fieldnameiter_dealloc, /* tp_dealloc */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + PyObject_GenericGetAttr, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + PyObject_SelfIter, /* tp_iter */ + (iternextfunc)formatteriter_next, /* tp_iternext */ + formatteriter_methods, /* tp_methods */ + 0, +}; + +/* unicode_formatter_parser is used to implement + string.Formatter.vformat. it parses a string and returns tuples + describing the parsed elements. It's a wrapper around + stringlib/string_format.h's MarkupIterator */ +static PyObject * +formatter_parser(PyObject *ignored, PyObject *self) +{ + formatteriterobject *it; + + if (!PyUnicode_Check(self)) { + PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name); + return NULL; + } + + if (PyUnicode_READY(self) == -1) + return NULL; + + it = PyObject_New(formatteriterobject, &PyFormatterIter_Type); + if (it == NULL) + return NULL; + + /* take ownership, give the object to the iterator */ + Py_INCREF(self); + it->str = self; + + /* initialize the contained MarkupIterator */ + MarkupIterator_init(&it->it_markup, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self)); + return (PyObject *)it; +} + + +/************************************************************************/ +/*********** fieldnameiterator ******************************************/ +/************************************************************************/ + + +/* This is used to implement string.Formatter.vparse(). It parses the + field name into attribute and item values. It's a Python-callable + wrapper around FieldNameIterator */ + +typedef struct { + PyObject_HEAD + PyObject *str; + FieldNameIterator it_field; +} fieldnameiterobject; + +static void +fieldnameiter_dealloc(fieldnameiterobject *it) +{ + Py_XDECREF(it->str); + PyObject_FREE(it); +} + +/* returns a tuple: + (is_attr, value) + is_attr is true if we used attribute syntax (e.g., '.foo') + false if we used index syntax (e.g., '[foo]') + value is an integer or string +*/ +static PyObject * +fieldnameiter_next(fieldnameiterobject *it) +{ + int result; + int is_attr; + Py_ssize_t idx; + SubString name; + + result = FieldNameIterator_next(&it->it_field, &is_attr, + &idx, &name); + if (result == 0 || result == 1) + /* if 0, error has already been set, if 1, iterator is empty */ + return NULL; + else { + PyObject* result = NULL; + PyObject* is_attr_obj = NULL; + PyObject* obj = NULL; + + is_attr_obj = PyBool_FromLong(is_attr); + if (is_attr_obj == NULL) + goto done; + + /* either an integer or a string */ + if (idx != -1) + obj = PyLong_FromSsize_t(idx); + else + obj = SubString_new_object(&name); + if (obj == NULL) + goto done; + + /* return a tuple of values */ + result = PyTuple_Pack(2, is_attr_obj, obj); + + done: + Py_XDECREF(is_attr_obj); + Py_XDECREF(obj); + return result; + } +} + +static PyMethodDef fieldnameiter_methods[] = { + {NULL, NULL} /* sentinel */ +}; + +static PyTypeObject PyFieldNameIter_Type = { + PyVarObject_HEAD_INIT(&PyType_Type, 0) + "fieldnameiterator", /* tp_name */ + sizeof(fieldnameiterobject), /* tp_basicsize */ + 0, /* tp_itemsize */ + /* methods */ + (destructor)fieldnameiter_dealloc, /* tp_dealloc */ 0, /* tp_vectorcall_offset */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ 0, /* tp_as_async */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - PyObject_GenericGetAttr, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - 0, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - PyObject_SelfIter, /* tp_iter */ - (iternextfunc)fieldnameiter_next, /* tp_iternext */ - fieldnameiter_methods, /* tp_methods */ - 0}; - -/* unicode_formatter_field_name_split is used to implement - string.Formatter.vformat. it takes a PEP 3101 "field name", and - returns a tuple of (first, rest): "first", the part before the - first '.' or '['; and "rest", an iterator for the rest of the field - name. it's a wrapper around stringlib/string_format.h's - field_name_split. The iterator it returns is a - FieldNameIterator */ -static PyObject * -formatter_field_name_split(PyObject *ignored, PyObject *self) -{ - SubString first; - Py_ssize_t first_idx; - fieldnameiterobject *it; - - PyObject *first_obj = NULL; - PyObject *result = NULL; - - if (!PyUnicode_Check(self)) { - PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name); - return NULL; - } - - if (PyUnicode_READY(self) == -1) - return NULL; - - it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type); - if (it == NULL) - return NULL; - - /* take ownership, give the object to the iterator. this is - just to keep the field_name alive */ - Py_INCREF(self); - it->str = self; - - /* Pass in auto_number = NULL. We'll return an empty string for - first_obj in that case. */ - if (!field_name_split((PyObject*)self, 0, PyUnicode_GET_LENGTH(self), - &first, &first_idx, &it->it_field, NULL)) - goto done; - - /* first becomes an integer, if possible; else a string */ - if (first_idx != -1) - first_obj = PyLong_FromSsize_t(first_idx); - else - /* convert "first" into a string object */ - first_obj = SubString_new_object(&first); - if (first_obj == NULL) - goto done; - - /* return a tuple of values */ - result = PyTuple_Pack(2, first_obj, it); - -done: - Py_XDECREF(it); - Py_XDECREF(first_obj); - return result; -} + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + PyObject_GenericGetAttr, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + PyObject_SelfIter, /* tp_iter */ + (iternextfunc)fieldnameiter_next, /* tp_iternext */ + fieldnameiter_methods, /* tp_methods */ + 0}; + +/* unicode_formatter_field_name_split is used to implement + string.Formatter.vformat. it takes a PEP 3101 "field name", and + returns a tuple of (first, rest): "first", the part before the + first '.' or '['; and "rest", an iterator for the rest of the field + name. it's a wrapper around stringlib/string_format.h's + field_name_split. The iterator it returns is a + FieldNameIterator */ +static PyObject * +formatter_field_name_split(PyObject *ignored, PyObject *self) +{ + SubString first; + Py_ssize_t first_idx; + fieldnameiterobject *it; + + PyObject *first_obj = NULL; + PyObject *result = NULL; + + if (!PyUnicode_Check(self)) { + PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name); + return NULL; + } + + if (PyUnicode_READY(self) == -1) + return NULL; + + it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type); + if (it == NULL) + return NULL; + + /* take ownership, give the object to the iterator. this is + just to keep the field_name alive */ + Py_INCREF(self); + it->str = self; + + /* Pass in auto_number = NULL. We'll return an empty string for + first_obj in that case. */ + if (!field_name_split((PyObject*)self, 0, PyUnicode_GET_LENGTH(self), + &first, &first_idx, &it->it_field, NULL)) + goto done; + + /* first becomes an integer, if possible; else a string */ + if (first_idx != -1) + first_obj = PyLong_FromSsize_t(first_idx); + else + /* convert "first" into a string object */ + first_obj = SubString_new_object(&first); + if (first_obj == NULL) + goto done; + + /* return a tuple of values */ + result = PyTuple_Pack(2, first_obj, it); + +done: + Py_XDECREF(it); + Py_XDECREF(first_obj); + return result; +} diff --git a/contrib/tools/python3/src/Objects/stringlib/unicodedefs.h b/contrib/tools/python3/src/Objects/stringlib/unicodedefs.h index f9ab32796e..3db5629e11 100644 --- a/contrib/tools/python3/src/Objects/stringlib/unicodedefs.h +++ b/contrib/tools/python3/src/Objects/stringlib/unicodedefs.h @@ -1,32 +1,32 @@ -#ifndef STRINGLIB_UNICODEDEFS_H -#define STRINGLIB_UNICODEDEFS_H - -/* this is sort of a hack. there's at least one place (formatting - floats) where some stringlib code takes a different path if it's - compiled as unicode. */ -#define STRINGLIB_IS_UNICODE 1 - -#define FASTSEARCH fastsearch -#define STRINGLIB(F) stringlib_##F -#define STRINGLIB_OBJECT PyUnicodeObject -#define STRINGLIB_SIZEOF_CHAR Py_UNICODE_SIZE -#define STRINGLIB_CHAR Py_UNICODE -#define STRINGLIB_TYPE_NAME "unicode" -#define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_EMPTY unicode_empty -#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE -#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK -#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL -#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL -#define STRINGLIB_STR PyUnicode_AS_UNICODE -#define STRINGLIB_LEN PyUnicode_GET_SIZE -#define STRINGLIB_NEW PyUnicode_FromUnicode -#define STRINGLIB_CHECK PyUnicode_Check -#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact - -#define STRINGLIB_TOSTR PyObject_Str -#define STRINGLIB_TOASCII PyObject_ASCII - -#define STRINGLIB_WANT_CONTAINS_OBJ 1 - -#endif /* !STRINGLIB_UNICODEDEFS_H */ +#ifndef STRINGLIB_UNICODEDEFS_H +#define STRINGLIB_UNICODEDEFS_H + +/* this is sort of a hack. there's at least one place (formatting + floats) where some stringlib code takes a different path if it's + compiled as unicode. */ +#define STRINGLIB_IS_UNICODE 1 + +#define FASTSEARCH fastsearch +#define STRINGLIB(F) stringlib_##F +#define STRINGLIB_OBJECT PyUnicodeObject +#define STRINGLIB_SIZEOF_CHAR Py_UNICODE_SIZE +#define STRINGLIB_CHAR Py_UNICODE +#define STRINGLIB_TYPE_NAME "unicode" +#define STRINGLIB_PARSE_CODE "U" +#define STRINGLIB_EMPTY unicode_empty +#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE +#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK +#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL +#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL +#define STRINGLIB_STR PyUnicode_AS_UNICODE +#define STRINGLIB_LEN PyUnicode_GET_SIZE +#define STRINGLIB_NEW PyUnicode_FromUnicode +#define STRINGLIB_CHECK PyUnicode_Check +#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact + +#define STRINGLIB_TOSTR PyObject_Str +#define STRINGLIB_TOASCII PyObject_ASCII + +#define STRINGLIB_WANT_CONTAINS_OBJ 1 + +#endif /* !STRINGLIB_UNICODEDEFS_H */ |