aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/tools/python3/src/Objects/stringlib
diff options
context:
space:
mode:
authorshadchin <shadchin@yandex-team.ru>2022-04-18 12:39:32 +0300
committershadchin <shadchin@yandex-team.ru>2022-04-18 12:39:32 +0300
commitd4be68e361f4258cf0848fc70018dfe37a2acc24 (patch)
tree153e294cd97ac8b5d7a989612704a0c1f58e8ad4 /contrib/tools/python3/src/Objects/stringlib
parent260c02f5ccf242d9d9b8a873afaf6588c00237d6 (diff)
downloadydb-d4be68e361f4258cf0848fc70018dfe37a2acc24.tar.gz
IGNIETFERRO-1816 Update Python 3 from 3.9.12 to 3.10.4
ref:9f96be6d02ee8044fdd6f124b799b270c20ce641
Diffstat (limited to 'contrib/tools/python3/src/Objects/stringlib')
-rw-r--r--contrib/tools/python3/src/Objects/stringlib/asciilib.h1
-rw-r--r--contrib/tools/python3/src/Objects/stringlib/clinic/transmogrify.h.h35
-rw-r--r--contrib/tools/python3/src/Objects/stringlib/codecs.h39
-rw-r--r--contrib/tools/python3/src/Objects/stringlib/fastsearch.h490
-rw-r--r--contrib/tools/python3/src/Objects/stringlib/find_max_char.h22
-rw-r--r--contrib/tools/python3/src/Objects/stringlib/join.h2
-rw-r--r--contrib/tools/python3/src/Objects/stringlib/partition.h27
-rw-r--r--contrib/tools/python3/src/Objects/stringlib/stringdefs.h1
-rw-r--r--contrib/tools/python3/src/Objects/stringlib/ucs1lib.h1
-rw-r--r--contrib/tools/python3/src/Objects/stringlib/ucs2lib.h1
-rw-r--r--contrib/tools/python3/src/Objects/stringlib/ucs4lib.h1
-rw-r--r--contrib/tools/python3/src/Objects/stringlib/unicode_format.h4
-rw-r--r--contrib/tools/python3/src/Objects/stringlib/unicodedefs.h1
13 files changed, 524 insertions, 101 deletions
diff --git a/contrib/tools/python3/src/Objects/stringlib/asciilib.h b/contrib/tools/python3/src/Objects/stringlib/asciilib.h
index e69a2c076e..7749e8fb33 100644
--- a/contrib/tools/python3/src/Objects/stringlib/asciilib.h
+++ b/contrib/tools/python3/src/Objects/stringlib/asciilib.h
@@ -11,7 +11,6 @@
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
-#define STRINGLIB_EMPTY unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
diff --git a/contrib/tools/python3/src/Objects/stringlib/clinic/transmogrify.h.h b/contrib/tools/python3/src/Objects/stringlib/clinic/transmogrify.h.h
index 8a3a060f12..a5135a0cba 100644
--- a/contrib/tools/python3/src/Objects/stringlib/clinic/transmogrify.h.h
+++ b/contrib/tools/python3/src/Objects/stringlib/clinic/transmogrify.h.h
@@ -33,11 +33,6 @@ stringlib_expandtabs(PyObject *self, PyObject *const *args, Py_ssize_t nargs, Py
if (!noptargs) {
goto skip_optional_pos;
}
- if (PyFloat_Check(args[0])) {
- PyErr_SetString(PyExc_TypeError,
- "integer argument expected, got float" );
- goto exit;
- }
tabsize = _PyLong_AsInt(args[0]);
if (tabsize == -1 && PyErr_Occurred()) {
goto exit;
@@ -73,14 +68,9 @@ stringlib_ljust(PyObject *self, PyObject *const *args, Py_ssize_t nargs)
if (!_PyArg_CheckPositional("ljust", nargs, 1, 2)) {
goto exit;
}
- if (PyFloat_Check(args[0])) {
- PyErr_SetString(PyExc_TypeError,
- "integer argument expected, got float" );
- goto exit;
- }
{
Py_ssize_t ival = -1;
- PyObject *iobj = PyNumber_Index(args[0]);
+ PyObject *iobj = _PyNumber_Index(args[0]);
if (iobj != NULL) {
ival = PyLong_AsSsize_t(iobj);
Py_DECREF(iobj);
@@ -134,14 +124,9 @@ stringlib_rjust(PyObject *self, PyObject *const *args, Py_ssize_t nargs)
if (!_PyArg_CheckPositional("rjust", nargs, 1, 2)) {
goto exit;
}
- if (PyFloat_Check(args[0])) {
- PyErr_SetString(PyExc_TypeError,
- "integer argument expected, got float" );
- goto exit;
- }
{
Py_ssize_t ival = -1;
- PyObject *iobj = PyNumber_Index(args[0]);
+ PyObject *iobj = _PyNumber_Index(args[0]);
if (iobj != NULL) {
ival = PyLong_AsSsize_t(iobj);
Py_DECREF(iobj);
@@ -195,14 +180,9 @@ stringlib_center(PyObject *self, PyObject *const *args, Py_ssize_t nargs)
if (!_PyArg_CheckPositional("center", nargs, 1, 2)) {
goto exit;
}
- if (PyFloat_Check(args[0])) {
- PyErr_SetString(PyExc_TypeError,
- "integer argument expected, got float" );
- goto exit;
- }
{
Py_ssize_t ival = -1;
- PyObject *iobj = PyNumber_Index(args[0]);
+ PyObject *iobj = _PyNumber_Index(args[0]);
if (iobj != NULL) {
ival = PyLong_AsSsize_t(iobj);
Py_DECREF(iobj);
@@ -252,14 +232,9 @@ stringlib_zfill(PyObject *self, PyObject *arg)
PyObject *return_value = NULL;
Py_ssize_t width;
- if (PyFloat_Check(arg)) {
- PyErr_SetString(PyExc_TypeError,
- "integer argument expected, got float" );
- goto exit;
- }
{
Py_ssize_t ival = -1;
- PyObject *iobj = PyNumber_Index(arg);
+ PyObject *iobj = _PyNumber_Index(arg);
if (iobj != NULL) {
ival = PyLong_AsSsize_t(iobj);
Py_DECREF(iobj);
@@ -274,4 +249,4 @@ stringlib_zfill(PyObject *self, PyObject *arg)
exit:
return return_value;
}
-/*[clinic end generated code: output=15be047aef999b4e input=a9049054013a1b77]*/
+/*[clinic end generated code: output=2d9abc7b1cffeca6 input=a9049054013a1b77]*/
diff --git a/contrib/tools/python3/src/Objects/stringlib/codecs.h b/contrib/tools/python3/src/Objects/stringlib/codecs.h
index 9b2a29ba3b..b17cda18f5 100644
--- a/contrib/tools/python3/src/Objects/stringlib/codecs.h
+++ b/contrib/tools/python3/src/Objects/stringlib/codecs.h
@@ -4,16 +4,16 @@
# error "codecs.h is specific to Unicode"
#endif
-#include "pycore_byteswap.h" // _Py_bswap32()
+#include "pycore_bitutils.h" // _Py_bswap32()
-/* Mask to quickly check whether a C 'long' contains a
+/* Mask to quickly check whether a C 'size_t' contains a
non-ASCII, UTF8-encoded char. */
-#if (SIZEOF_LONG == 8)
-# define ASCII_CHAR_MASK 0x8080808080808080UL
-#elif (SIZEOF_LONG == 4)
-# define ASCII_CHAR_MASK 0x80808080UL
+#if (SIZEOF_SIZE_T == 8)
+# define ASCII_CHAR_MASK 0x8080808080808080ULL
+#elif (SIZEOF_SIZE_T == 4)
+# define ASCII_CHAR_MASK 0x80808080U
#else
-# error C 'long' size should be either 4 or 8!
+# error C 'size_t' size should be either 4 or 8!
#endif
/* 10xxxxxx */
@@ -26,7 +26,6 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
{
Py_UCS4 ch;
const char *s = *inptr;
- const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
STRINGLIB_CHAR *p = dest + *outpos;
while (s < end) {
@@ -36,19 +35,19 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
/* Fast path for runs of ASCII characters. Given that common UTF-8
input will consist of an overwhelming majority of ASCII
characters, we try to optimize for this case by checking
- as many characters as a C 'long' can contain.
+ as many characters as a C 'size_t' can contain.
First, check if we can do an aligned read, as most CPUs have
a penalty for unaligned reads.
*/
- if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
+ if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
/* Help register allocation */
const char *_s = s;
STRINGLIB_CHAR *_p = p;
- while (_s < aligned_end) {
- /* Read a whole long at a time (either 4 or 8 bytes),
+ while (_s + SIZEOF_SIZE_T <= end) {
+ /* Read a whole size_t at a time (either 4 or 8 bytes),
and do a fast unrolled copy if it only contains ASCII
characters. */
- unsigned long value = *(const unsigned long *) _s;
+ size_t value = *(const size_t *) _s;
if (value & ASCII_CHAR_MASK)
break;
#if PY_LITTLE_ENDIAN
@@ -56,14 +55,14 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
_p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
_p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
_p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
-# if SIZEOF_LONG == 8
+# if SIZEOF_SIZE_T == 8
_p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
_p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
_p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
_p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
# endif
#else
-# if SIZEOF_LONG == 8
+# if SIZEOF_SIZE_T == 8
_p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
_p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
_p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
@@ -79,8 +78,8 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
_p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
# endif
#endif
- _s += SIZEOF_LONG;
- _p += SIZEOF_LONG;
+ _s += SIZEOF_SIZE_T;
+ _p += SIZEOF_SIZE_T;
}
s = _s;
p = _p;
@@ -496,8 +495,6 @@ STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
int native_ordering)
{
Py_UCS4 ch;
- const unsigned char *aligned_end =
- (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
const unsigned char *q = *inptr;
STRINGLIB_CHAR *p = dest + *outpos;
/* Offsets from q for retrieving byte pairs in the right order. */
@@ -512,10 +509,10 @@ STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
Py_UCS4 ch2;
/* First check for possible aligned read of a C 'long'. Unaligned
reads are more expensive, better to defer to another iteration. */
- if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
+ if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) {
/* Fast path for runs of in-range non-surrogate chars. */
const unsigned char *_q = q;
- while (_q < aligned_end) {
+ while (_q + SIZEOF_LONG <= e) {
unsigned long block = * (const unsigned long *) _q;
if (native_ordering) {
/* Can use buffer directly */
diff --git a/contrib/tools/python3/src/Objects/stringlib/fastsearch.h b/contrib/tools/python3/src/Objects/stringlib/fastsearch.h
index 56a4467d35..6574720b60 100644
--- a/contrib/tools/python3/src/Objects/stringlib/fastsearch.h
+++ b/contrib/tools/python3/src/Objects/stringlib/fastsearch.h
@@ -9,10 +9,16 @@
/* note: fastsearch may access s[n], which isn't a problem when using
Python's ordinary string types, but may cause problems if you're
using this code in other contexts. also, the count mode returns -1
- if there cannot possible be a match in the target string, and 0 if
+ if there cannot possibly be a match in the target string, and 0 if
it has actually checked for matches, but didn't find any. callers
beware! */
+/* If the strings are long enough, use Crochemore and Perrin's Two-Way
+ algorithm, which has worst-case O(n) runtime and best-case O(n/k).
+ Also compute a table of shifts to achieve O(n/k) in more cases,
+ and often (data dependent) deduce larger shifts than pure C&P can
+ deduce. */
+
#define FAST_COUNT 0
#define FAST_SEARCH 1
#define FAST_RSEARCH 2
@@ -160,6 +166,353 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
#undef MEMCHR_CUT_OFF
+/* Change to a 1 to see logging comments walk through the algorithm. */
+#if 0 && STRINGLIB_SIZEOF_CHAR == 1
+# define LOG(...) printf(__VA_ARGS__)
+# define LOG_STRING(s, n) printf("\"%.*s\"", n, s)
+#else
+# define LOG(...)
+# define LOG_STRING(s, n)
+#endif
+
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle,
+ Py_ssize_t *return_period, int invert_alphabet)
+{
+ /* Do a lexicographic search. Essentially this:
+ >>> max(needle[i:] for i in range(len(needle)+1))
+ Also find the period of the right half. */
+ Py_ssize_t max_suffix = 0;
+ Py_ssize_t candidate = 1;
+ Py_ssize_t k = 0;
+ // The period of the right half.
+ Py_ssize_t period = 1;
+
+ while (candidate + k < len_needle) {
+ // each loop increases candidate + k + max_suffix
+ STRINGLIB_CHAR a = needle[candidate + k];
+ STRINGLIB_CHAR b = needle[max_suffix + k];
+ // check if the suffix at candidate is better than max_suffix
+ if (invert_alphabet ? (b < a) : (a < b)) {
+ // Fell short of max_suffix.
+ // The next k + 1 characters are non-increasing
+ // from candidate, so they won't start a maximal suffix.
+ candidate += k + 1;
+ k = 0;
+ // We've ruled out any period smaller than what's
+ // been scanned since max_suffix.
+ period = candidate - max_suffix;
+ }
+ else if (a == b) {
+ if (k + 1 != period) {
+ // Keep scanning the equal strings
+ k++;
+ }
+ else {
+ // Matched a whole period.
+ // Start matching the next period.
+ candidate += period;
+ k = 0;
+ }
+ }
+ else {
+ // Did better than max_suffix, so replace it.
+ max_suffix = candidate;
+ candidate++;
+ k = 0;
+ period = 1;
+ }
+ }
+ *return_period = period;
+ return max_suffix;
+}
+
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle,
+ Py_ssize_t len_needle,
+ Py_ssize_t *return_period)
+{
+ /* Do a "critical factorization", making it so that:
+ >>> needle = (left := needle[:cut]) + (right := needle[cut:])
+ where the "local period" of the cut is maximal.
+
+ The local period of the cut is the minimal length of a string w
+ such that (left endswith w or w endswith left)
+ and (right startswith w or w startswith left).
+
+ The Critical Factorization Theorem says that this maximal local
+ period is the global period of the string.
+
+ Crochemore and Perrin (1991) show that this cut can be computed
+ as the later of two cuts: one that gives a lexicographically
+ maximal right half, and one that gives the same with the
+ with respect to a reversed alphabet-ordering.
+
+ This is what we want to happen:
+ >>> x = "GCAGAGAG"
+ >>> cut, period = factorize(x)
+ >>> x[:cut], (right := x[cut:])
+ ('GC', 'AGAGAG')
+ >>> period # right half period
+ 2
+ >>> right[period:] == right[:-period]
+ True
+
+ This is how the local period lines up in the above example:
+ GC | AGAGAG
+ AGAGAGC = AGAGAGC
+ The length of this minimal repetition is 7, which is indeed the
+ period of the original string. */
+
+ Py_ssize_t cut1, period1, cut2, period2, cut, period;
+ cut1 = STRINGLIB(_lex_search)(needle, len_needle, &period1, 0);
+ cut2 = STRINGLIB(_lex_search)(needle, len_needle, &period2, 1);
+
+ // Take the later cut.
+ if (cut1 > cut2) {
+ period = period1;
+ cut = cut1;
+ }
+ else {
+ period = period2;
+ cut = cut2;
+ }
+
+ LOG("split: "); LOG_STRING(needle, cut);
+ LOG(" + "); LOG_STRING(needle + cut, len_needle - cut);
+ LOG("\n");
+
+ *return_period = period;
+ return cut;
+}
+
+#define SHIFT_TYPE uint8_t
+#define NOT_FOUND ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U)
+#define SHIFT_OVERFLOW (NOT_FOUND - 1U)
+
+#define TABLE_SIZE_BITS 6
+#define TABLE_SIZE (1U << TABLE_SIZE_BITS)
+#define TABLE_MASK (TABLE_SIZE - 1U)
+
+typedef struct STRINGLIB(_pre) {
+ const STRINGLIB_CHAR *needle;
+ Py_ssize_t len_needle;
+ Py_ssize_t cut;
+ Py_ssize_t period;
+ int is_periodic;
+ SHIFT_TYPE table[TABLE_SIZE];
+} STRINGLIB(prework);
+
+
+Py_LOCAL_INLINE(void)
+STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle,
+ STRINGLIB(prework) *p)
+{
+ p->needle = needle;
+ p->len_needle = len_needle;
+ p->cut = STRINGLIB(_factorize)(needle, len_needle, &(p->period));
+ assert(p->period + p->cut <= len_needle);
+ p->is_periodic = (0 == memcmp(needle,
+ needle + p->period,
+ p->cut * STRINGLIB_SIZEOF_CHAR));
+ if (p->is_periodic) {
+ assert(p->cut <= len_needle/2);
+ assert(p->cut < p->period);
+ }
+ else {
+ // A lower bound on the period
+ p->period = Py_MAX(p->cut, len_needle - p->cut) + 1;
+ }
+ // Now fill up a table
+ memset(&(p->table[0]), 0xff, TABLE_SIZE*sizeof(SHIFT_TYPE));
+ assert(p->table[0] == NOT_FOUND);
+ assert(p->table[TABLE_MASK] == NOT_FOUND);
+ for (Py_ssize_t i = 0; i < len_needle; i++) {
+ Py_ssize_t shift = len_needle - i;
+ if (shift > SHIFT_OVERFLOW) {
+ shift = SHIFT_OVERFLOW;
+ }
+ p->table[needle[i] & TABLE_MASK] = Py_SAFE_DOWNCAST(shift,
+ Py_ssize_t,
+ SHIFT_TYPE);
+ }
+}
+
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
+ STRINGLIB(prework) *p)
+{
+ // Crochemore and Perrin's (1991) Two-Way algorithm.
+ // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
+ Py_ssize_t len_needle = p->len_needle;
+ Py_ssize_t cut = p->cut;
+ Py_ssize_t period = p->period;
+ const STRINGLIB_CHAR *needle = p->needle;
+ const STRINGLIB_CHAR *window = haystack;
+ const STRINGLIB_CHAR *last_window = haystack + len_haystack - len_needle;
+ SHIFT_TYPE *table = p->table;
+ LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack);
+
+ if (p->is_periodic) {
+ LOG("Needle is periodic.\n");
+ Py_ssize_t memory = 0;
+ periodicwindowloop:
+ while (window <= last_window) {
+ Py_ssize_t i = Py_MAX(cut, memory);
+
+ // Visualize the line-up:
+ LOG("> "); LOG_STRING(haystack, len_haystack);
+ LOG("\n> "); LOG("%*s", window - haystack, "");
+ LOG_STRING(needle, len_needle);
+ LOG("\n> "); LOG("%*s", window - haystack + i, "");
+ LOG(" ^ <-- cut\n");
+
+ if (window[i] != needle[i]) {
+ // Sunday's trick: if we're going to jump, we might
+ // as well jump to line up the character *after* the
+ // current window.
+ STRINGLIB_CHAR first_outside = window[len_needle];
+ SHIFT_TYPE shift = table[first_outside & TABLE_MASK];
+ if (shift == NOT_FOUND) {
+ LOG("\"%c\" not found. Skipping entirely.\n",
+ first_outside);
+ window += len_needle + 1;
+ }
+ else {
+ LOG("Shifting to line up \"%c\".\n", first_outside);
+ Py_ssize_t memory_shift = i - cut + 1;
+ window += Py_MAX(shift, memory_shift);
+ }
+ memory = 0;
+ goto periodicwindowloop;
+ }
+ for (i = i + 1; i < len_needle; i++) {
+ if (needle[i] != window[i]) {
+ LOG("Right half does not match. Jump ahead by %d.\n",
+ i - cut + 1);
+ window += i - cut + 1;
+ memory = 0;
+ goto periodicwindowloop;
+ }
+ }
+ for (i = memory; i < cut; i++) {
+ if (needle[i] != window[i]) {
+ LOG("Left half does not match. Jump ahead by period %d.\n",
+ period);
+ window += period;
+ memory = len_needle - period;
+ goto periodicwindowloop;
+ }
+ }
+ LOG("Left half matches. Returning %d.\n",
+ window - haystack);
+ return window - haystack;
+ }
+ }
+ else {
+ LOG("Needle is not periodic.\n");
+ assert(cut < len_needle);
+ STRINGLIB_CHAR needle_cut = needle[cut];
+ windowloop:
+ while (window <= last_window) {
+
+ // Visualize the line-up:
+ LOG("> "); LOG_STRING(haystack, len_haystack);
+ LOG("\n> "); LOG("%*s", window - haystack, "");
+ LOG_STRING(needle, len_needle);
+ LOG("\n> "); LOG("%*s", window - haystack + cut, "");
+ LOG(" ^ <-- cut\n");
+
+ if (window[cut] != needle_cut) {
+ // Sunday's trick: if we're going to jump, we might
+ // as well jump to line up the character *after* the
+ // current window.
+ STRINGLIB_CHAR first_outside = window[len_needle];
+ SHIFT_TYPE shift = table[first_outside & TABLE_MASK];
+ if (shift == NOT_FOUND) {
+ LOG("\"%c\" not found. Skipping entirely.\n",
+ first_outside);
+ window += len_needle + 1;
+ }
+ else {
+ LOG("Shifting to line up \"%c\".\n", first_outside);
+ window += shift;
+ }
+ goto windowloop;
+ }
+ for (Py_ssize_t i = cut + 1; i < len_needle; i++) {
+ if (needle[i] != window[i]) {
+ LOG("Right half does not match. Advance by %d.\n",
+ i - cut + 1);
+ window += i - cut + 1;
+ goto windowloop;
+ }
+ }
+ for (Py_ssize_t i = 0; i < cut; i++) {
+ if (needle[i] != window[i]) {
+ LOG("Left half does not match. Advance by period %d.\n",
+ period);
+ window += period;
+ goto windowloop;
+ }
+ }
+ LOG("Left half matches. Returning %d.\n", window - haystack);
+ return window - haystack;
+ }
+ }
+ LOG("Not found. Returning -1.\n");
+ return -1;
+}
+
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(_two_way_find)(const STRINGLIB_CHAR *haystack,
+ Py_ssize_t len_haystack,
+ const STRINGLIB_CHAR *needle,
+ Py_ssize_t len_needle)
+{
+ LOG("###### Finding \"%s\" in \"%s\".\n", needle, haystack);
+ STRINGLIB(prework) p;
+ STRINGLIB(_preprocess)(needle, len_needle, &p);
+ return STRINGLIB(_two_way)(haystack, len_haystack, &p);
+}
+
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack,
+ Py_ssize_t len_haystack,
+ const STRINGLIB_CHAR *needle,
+ Py_ssize_t len_needle,
+ Py_ssize_t maxcount)
+{
+ LOG("###### Counting \"%s\" in \"%s\".\n", needle, haystack);
+ STRINGLIB(prework) p;
+ STRINGLIB(_preprocess)(needle, len_needle, &p);
+ Py_ssize_t index = 0, count = 0;
+ while (1) {
+ Py_ssize_t result;
+ result = STRINGLIB(_two_way)(haystack + index,
+ len_haystack - index, &p);
+ if (result == -1) {
+ return count;
+ }
+ count++;
+ if (count == maxcount) {
+ return maxcount;
+ }
+ index += result + len_needle;
+ }
+ return count;
+}
+
+#undef SHIFT_TYPE
+#undef NOT_FOUND
+#undef SHIFT_OVERFLOW
+#undef TABLE_SIZE_BITS
+#undef TABLE_SIZE
+#undef TABLE_MASK
+
+#undef LOG
+#undef LOG_STRING
+
Py_LOCAL_INLINE(Py_ssize_t)
FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
const STRINGLIB_CHAR* p, Py_ssize_t m,
@@ -195,10 +548,22 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
}
mlast = m - 1;
- skip = mlast - 1;
+ skip = mlast;
mask = 0;
if (mode != FAST_RSEARCH) {
+ if (m >= 100 && w >= 2000 && w / m >= 5) {
+ /* For larger problems where the needle isn't a huge
+ percentage of the size of the haystack, the relatively
+ expensive O(m) startup cost of the two-way algorithm
+ will surely pay off. */
+ if (mode == FAST_SEARCH) {
+ return STRINGLIB(_two_way_find)(s, n, p, m);
+ }
+ else {
+ return STRINGLIB(_two_way_count)(s, n, p, m, maxcount);
+ }
+ }
const STRINGLIB_CHAR *ss = s + m - 1;
const STRINGLIB_CHAR *pp = p + m - 1;
@@ -207,41 +572,118 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
/* process pattern[:-1] */
for (i = 0; i < mlast; i++) {
STRINGLIB_BLOOM_ADD(mask, p[i]);
- if (p[i] == p[mlast])
+ if (p[i] == p[mlast]) {
skip = mlast - i - 1;
+ }
}
/* process pattern[-1] outside the loop */
STRINGLIB_BLOOM_ADD(mask, p[mlast]);
+ if (m >= 100 && w >= 8000) {
+ /* To ensure that we have good worst-case behavior,
+ here's an adaptive version of the algorithm, where if
+ we match O(m) characters without any matches of the
+ entire needle, then we predict that the startup cost of
+ the two-way algorithm will probably be worth it. */
+ Py_ssize_t hits = 0;
+ for (i = 0; i <= w; i++) {
+ if (ss[i] == pp[0]) {
+ /* candidate match */
+ for (j = 0; j < mlast; j++) {
+ if (s[i+j] != p[j]) {
+ break;
+ }
+ }
+ if (j == mlast) {
+ /* got a match! */
+ if (mode != FAST_COUNT) {
+ return i;
+ }
+ count++;
+ if (count == maxcount) {
+ return maxcount;
+ }
+ i = i + mlast;
+ continue;
+ }
+ /* miss: check if next character is part of pattern */
+ if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
+ i = i + m;
+ }
+ else {
+ i = i + skip;
+ }
+ hits += j + 1;
+ if (hits >= m / 4 && i < w - 1000) {
+ /* We've done O(m) fruitless comparisons
+ anyway, so spend the O(m) cost on the
+ setup for the two-way algorithm. */
+ Py_ssize_t res;
+ if (mode == FAST_COUNT) {
+ res = STRINGLIB(_two_way_count)(
+ s+i, n-i, p, m, maxcount-count);
+ return count + res;
+ }
+ else {
+ res = STRINGLIB(_two_way_find)(s+i, n-i, p, m);
+ if (res == -1) {
+ return -1;
+ }
+ return i + res;
+ }
+ }
+ }
+ else {
+ /* skip: check if next character is part of pattern */
+ if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
+ i = i + m;
+ }
+ }
+ }
+ if (mode != FAST_COUNT) {
+ return -1;
+ }
+ return count;
+ }
+ /* The standard, non-adaptive version of the algorithm. */
for (i = 0; i <= w; i++) {
/* note: using mlast in the skip path slows things down on x86 */
if (ss[i] == pp[0]) {
/* candidate match */
- for (j = 0; j < mlast; j++)
- if (s[i+j] != p[j])
+ for (j = 0; j < mlast; j++) {
+ if (s[i+j] != p[j]) {
break;
+ }
+ }
if (j == mlast) {
/* got a match! */
- if (mode != FAST_COUNT)
+ if (mode != FAST_COUNT) {
return i;
+ }
count++;
- if (count == maxcount)
+ if (count == maxcount) {
return maxcount;
+ }
i = i + mlast;
continue;
}
/* miss: check if next character is part of pattern */
- if (!STRINGLIB_BLOOM(mask, ss[i+1]))
+ if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
i = i + m;
- else
+ }
+ else {
i = i + skip;
- } else {
+ }
+ }
+ else {
/* skip: check if next character is part of pattern */
- if (!STRINGLIB_BLOOM(mask, ss[i+1]))
+ if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
i = i + m;
+ }
}
}
- } else { /* FAST_RSEARCH */
+ }
+ else { /* FAST_RSEARCH */
/* create compressed boyer-moore delta 1 table */
@@ -250,28 +692,36 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
/* process pattern[:0:-1] */
for (i = mlast; i > 0; i--) {
STRINGLIB_BLOOM_ADD(mask, p[i]);
- if (p[i] == p[0])
+ if (p[i] == p[0]) {
skip = i - 1;
+ }
}
for (i = w; i >= 0; i--) {
if (s[i] == p[0]) {
/* candidate match */
- for (j = mlast; j > 0; j--)
- if (s[i+j] != p[j])
+ for (j = mlast; j > 0; j--) {
+ if (s[i+j] != p[j]) {
break;
- if (j == 0)
+ }
+ }
+ if (j == 0) {
/* got a match! */
return i;
+ }
/* miss: check if previous character is part of pattern */
- if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1]))
+ if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) {
i = i - m;
- else
+ }
+ else {
i = i - skip;
- } else {
+ }
+ }
+ else {
/* skip: check if previous character is part of pattern */
- if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1]))
+ if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) {
i = i - m;
+ }
}
}
}
diff --git a/contrib/tools/python3/src/Objects/stringlib/find_max_char.h b/contrib/tools/python3/src/Objects/stringlib/find_max_char.h
index f4e0a7761d..b9ffdfc2e3 100644
--- a/contrib/tools/python3/src/Objects/stringlib/find_max_char.h
+++ b/contrib/tools/python3/src/Objects/stringlib/find_max_char.h
@@ -4,14 +4,14 @@
# error "find_max_char.h is specific to Unicode"
#endif
-/* Mask to quickly check whether a C 'long' contains a
+/* Mask to quickly check whether a C 'size_t' contains a
non-ASCII, UTF8-encoded char. */
-#if (SIZEOF_LONG == 8)
-# define UCS1_ASCII_CHAR_MASK 0x8080808080808080UL
-#elif (SIZEOF_LONG == 4)
-# define UCS1_ASCII_CHAR_MASK 0x80808080UL
+#if (SIZEOF_SIZE_T == 8)
+# define UCS1_ASCII_CHAR_MASK 0x8080808080808080ULL
+#elif (SIZEOF_SIZE_T == 4)
+# define UCS1_ASCII_CHAR_MASK 0x80808080U
#else
-# error C 'long' size should be either 4 or 8!
+# error C 'size_t' size should be either 4 or 8!
#endif
#if STRINGLIB_SIZEOF_CHAR == 1
@@ -20,18 +20,16 @@ Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
{
const unsigned char *p = (const unsigned char *) begin;
- const unsigned char *aligned_end =
- (const unsigned char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
while (p < end) {
- if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
+ if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
/* Help register allocation */
const unsigned char *_p = p;
- while (_p < aligned_end) {
- unsigned long value = *(const unsigned long *) _p;
+ while (_p + SIZEOF_SIZE_T <= end) {
+ size_t value = *(const size_t *) _p;
if (value & UCS1_ASCII_CHAR_MASK)
return 255;
- _p += SIZEOF_LONG;
+ _p += SIZEOF_SIZE_T;
}
p = _p;
if (p == end)
diff --git a/contrib/tools/python3/src/Objects/stringlib/join.h b/contrib/tools/python3/src/Objects/stringlib/join.h
index 53bcbdea7a..62e4c98de7 100644
--- a/contrib/tools/python3/src/Objects/stringlib/join.h
+++ b/contrib/tools/python3/src/Objects/stringlib/join.h
@@ -155,7 +155,7 @@ done:
for (i = 0; i < nbufs; i++)
PyBuffer_Release(&buffers[i]);
if (buffers != static_buffers)
- PyMem_FREE(buffers);
+ PyMem_Free(buffers);
return res;
}
diff --git a/contrib/tools/python3/src/Objects/stringlib/partition.h b/contrib/tools/python3/src/Objects/stringlib/partition.h
index ed32a6f2b3..bcc217697b 100644
--- a/contrib/tools/python3/src/Objects/stringlib/partition.h
+++ b/contrib/tools/python3/src/Objects/stringlib/partition.h
@@ -1,9 +1,14 @@
/* stringlib: partition implementation */
#ifndef STRINGLIB_FASTSEARCH_H
-#error must include "stringlib/fastsearch.h" before including this module
+# error must include "stringlib/fastsearch.h" before including this module
#endif
+#if !STRINGLIB_MUTABLE && !defined(STRINGLIB_GET_EMPTY)
+# error "STRINGLIB_GET_EMPTY must be defined if STRINGLIB_MUTABLE is zero"
+#endif
+
+
Py_LOCAL_INLINE(PyObject*)
STRINGLIB(partition)(PyObject* str_obj,
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
@@ -37,10 +42,12 @@ STRINGLIB(partition)(PyObject* str_obj,
#else
Py_INCREF(str_obj);
PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj);
- Py_INCREF(STRINGLIB_EMPTY);
- PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
- Py_INCREF(STRINGLIB_EMPTY);
- PyTuple_SET_ITEM(out, 2, (PyObject*) STRINGLIB_EMPTY);
+ PyObject *empty = (PyObject*)STRINGLIB_GET_EMPTY();
+ assert(empty != NULL);
+ Py_INCREF(empty);
+ PyTuple_SET_ITEM(out, 1, empty);
+ Py_INCREF(empty);
+ PyTuple_SET_ITEM(out, 2, empty);
#endif
return out;
}
@@ -90,10 +97,12 @@ STRINGLIB(rpartition)(PyObject* str_obj,
return NULL;
}
#else
- Py_INCREF(STRINGLIB_EMPTY);
- PyTuple_SET_ITEM(out, 0, (PyObject*) STRINGLIB_EMPTY);
- Py_INCREF(STRINGLIB_EMPTY);
- PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
+ PyObject *empty = (PyObject*)STRINGLIB_GET_EMPTY();
+ assert(empty != NULL);
+ Py_INCREF(empty);
+ PyTuple_SET_ITEM(out, 0, empty);
+ Py_INCREF(empty);
+ PyTuple_SET_ITEM(out, 1, empty);
Py_INCREF(str_obj);
PyTuple_SET_ITEM(out, 2, (PyObject*) str_obj);
#endif
diff --git a/contrib/tools/python3/src/Objects/stringlib/stringdefs.h b/contrib/tools/python3/src/Objects/stringlib/stringdefs.h
index ce27f3e408..88641b25d4 100644
--- a/contrib/tools/python3/src/Objects/stringlib/stringdefs.h
+++ b/contrib/tools/python3/src/Objects/stringlib/stringdefs.h
@@ -13,7 +13,6 @@
#define STRINGLIB_CHAR char
#define STRINGLIB_TYPE_NAME "string"
#define STRINGLIB_PARSE_CODE "S"
-#define STRINGLIB_EMPTY nullstring
#define STRINGLIB_ISSPACE Py_ISSPACE
#define STRINGLIB_ISLINEBREAK(x) ((x == '\n') || (x == '\r'))
#define STRINGLIB_ISDECIMAL(x) ((x >= '0') && (x <= '9'))
diff --git a/contrib/tools/python3/src/Objects/stringlib/ucs1lib.h b/contrib/tools/python3/src/Objects/stringlib/ucs1lib.h
index bc4b104f11..5b0b8a025e 100644
--- a/contrib/tools/python3/src/Objects/stringlib/ucs1lib.h
+++ b/contrib/tools/python3/src/Objects/stringlib/ucs1lib.h
@@ -11,7 +11,6 @@
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
-#define STRINGLIB_EMPTY unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
diff --git a/contrib/tools/python3/src/Objects/stringlib/ucs2lib.h b/contrib/tools/python3/src/Objects/stringlib/ucs2lib.h
index 86a1dff1b5..6af01511c5 100644
--- a/contrib/tools/python3/src/Objects/stringlib/ucs2lib.h
+++ b/contrib/tools/python3/src/Objects/stringlib/ucs2lib.h
@@ -11,7 +11,6 @@
#define STRINGLIB_CHAR Py_UCS2
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
-#define STRINGLIB_EMPTY unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
diff --git a/contrib/tools/python3/src/Objects/stringlib/ucs4lib.h b/contrib/tools/python3/src/Objects/stringlib/ucs4lib.h
index 3c32a93c96..39071a0cdf 100644
--- a/contrib/tools/python3/src/Objects/stringlib/ucs4lib.h
+++ b/contrib/tools/python3/src/Objects/stringlib/ucs4lib.h
@@ -11,7 +11,6 @@
#define STRINGLIB_CHAR Py_UCS4
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
-#define STRINGLIB_EMPTY unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
diff --git a/contrib/tools/python3/src/Objects/stringlib/unicode_format.h b/contrib/tools/python3/src/Objects/stringlib/unicode_format.h
index b526ad21b8..7152ec6ebe 100644
--- a/contrib/tools/python3/src/Objects/stringlib/unicode_format.h
+++ b/contrib/tools/python3/src/Objects/stringlib/unicode_format.h
@@ -983,7 +983,7 @@ static void
formatteriter_dealloc(formatteriterobject *it)
{
Py_XDECREF(it->str);
- PyObject_FREE(it);
+ PyObject_Free(it);
}
/* returns a tuple:
@@ -1147,7 +1147,7 @@ static void
fieldnameiter_dealloc(fieldnameiterobject *it)
{
Py_XDECREF(it->str);
- PyObject_FREE(it);
+ PyObject_Free(it);
}
/* returns a tuple:
diff --git a/contrib/tools/python3/src/Objects/stringlib/unicodedefs.h b/contrib/tools/python3/src/Objects/stringlib/unicodedefs.h
index 3db5629e11..5ea79cd4f5 100644
--- a/contrib/tools/python3/src/Objects/stringlib/unicodedefs.h
+++ b/contrib/tools/python3/src/Objects/stringlib/unicodedefs.h
@@ -13,7 +13,6 @@
#define STRINGLIB_CHAR Py_UNICODE
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
-#define STRINGLIB_EMPTY unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL