diff options
author | AlexSm <alex@ydb.tech> | 2024-03-05 10:40:59 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-05 12:40:59 +0300 |
commit | 1ac13c847b5358faba44dbb638a828e24369467b (patch) | |
tree | 07672b4dd3604ad3dee540a02c6494cb7d10dc3d /contrib/tools/python3/Objects/stringlib/split.h | |
parent | ffcca3e7f7958ddc6487b91d3df8c01054bd0638 (diff) | |
download | ydb-1ac13c847b5358faba44dbb638a828e24369467b.tar.gz |
Library import 16 (#2433)
Co-authored-by: robot-piglet <robot-piglet@yandex-team.com>
Co-authored-by: deshevoy <deshevoy@yandex-team.com>
Co-authored-by: robot-contrib <robot-contrib@yandex-team.com>
Co-authored-by: thegeorg <thegeorg@yandex-team.com>
Co-authored-by: robot-ya-builder <robot-ya-builder@yandex-team.com>
Co-authored-by: svidyuk <svidyuk@yandex-team.com>
Co-authored-by: shadchin <shadchin@yandex-team.com>
Co-authored-by: robot-ratatosk <robot-ratatosk@yandex-team.com>
Co-authored-by: innokentii <innokentii@yandex-team.com>
Co-authored-by: arkady-e1ppa <arkady-e1ppa@yandex-team.com>
Co-authored-by: snermolaev <snermolaev@yandex-team.com>
Co-authored-by: dimdim11 <dimdim11@yandex-team.com>
Co-authored-by: kickbutt <kickbutt@yandex-team.com>
Co-authored-by: abdullinsaid <abdullinsaid@yandex-team.com>
Co-authored-by: korsunandrei <korsunandrei@yandex-team.com>
Co-authored-by: petrk <petrk@yandex-team.com>
Co-authored-by: miroslav2 <miroslav2@yandex-team.com>
Co-authored-by: serjflint <serjflint@yandex-team.com>
Co-authored-by: akhropov <akhropov@yandex-team.com>
Co-authored-by: prettyboy <prettyboy@yandex-team.com>
Co-authored-by: ilikepugs <ilikepugs@yandex-team.com>
Co-authored-by: hiddenpath <hiddenpath@yandex-team.com>
Co-authored-by: mikhnenko <mikhnenko@yandex-team.com>
Co-authored-by: spreis <spreis@yandex-team.com>
Co-authored-by: andreyshspb <andreyshspb@yandex-team.com>
Co-authored-by: dimaandreev <dimaandreev@yandex-team.com>
Co-authored-by: rashid <rashid@yandex-team.com>
Co-authored-by: robot-ydb-importer <robot-ydb-importer@yandex-team.com>
Co-authored-by: r-vetrov <r-vetrov@yandex-team.com>
Co-authored-by: ypodlesov <ypodlesov@yandex-team.com>
Co-authored-by: zaverden <zaverden@yandex-team.com>
Co-authored-by: vpozdyayev <vpozdyayev@yandex-team.com>
Co-authored-by: robot-cozmo <robot-cozmo@yandex-team.com>
Co-authored-by: v-korovin <v-korovin@yandex-team.com>
Co-authored-by: arikon <arikon@yandex-team.com>
Co-authored-by: khoden <khoden@yandex-team.com>
Co-authored-by: psydmm <psydmm@yandex-team.com>
Co-authored-by: robot-javacom <robot-javacom@yandex-team.com>
Co-authored-by: dtorilov <dtorilov@yandex-team.com>
Co-authored-by: sennikovmv <sennikovmv@yandex-team.com>
Co-authored-by: hcpp <hcpp@ydb.tech>
Diffstat (limited to 'contrib/tools/python3/Objects/stringlib/split.h')
-rw-r--r-- | contrib/tools/python3/Objects/stringlib/split.h | 390 |
1 files changed, 390 insertions, 0 deletions
diff --git a/contrib/tools/python3/Objects/stringlib/split.h b/contrib/tools/python3/Objects/stringlib/split.h new file mode 100644 index 0000000000..0c11b7214e --- /dev/null +++ b/contrib/tools/python3/Objects/stringlib/split.h @@ -0,0 +1,390 @@ +/* stringlib: split implementation */ + +#ifndef STRINGLIB_FASTSEARCH_H +#error must include "stringlib/fastsearch.h" before including this module +#endif + +/* Overallocate the initial list to reduce the number of reallocs for small + split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three + resizes, to sizes 4, 8, then 16. Most observed string splits are for human + text (roughly 11 words per line) and field delimited data (usually 1-10 + fields). For large strings the split algorithms are bandwidth limited + so increasing the preallocation likely will not improve things.*/ + +#define MAX_PREALLOC 12 + +/* 5 splits gives 6 elements */ +#define PREALLOC_SIZE(maxsplit) \ + (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) + +#define SPLIT_APPEND(data, left, right) \ + sub = STRINGLIB_NEW((data) + (left), \ + (right) - (left)); \ + if (sub == NULL) \ + goto onError; \ + if (PyList_Append(list, sub)) { \ + Py_DECREF(sub); \ + goto onError; \ + } \ + else \ + Py_DECREF(sub); + +#define SPLIT_ADD(data, left, right) { \ + sub = STRINGLIB_NEW((data) + (left), \ + (right) - (left)); \ + if (sub == NULL) \ + goto onError; \ + if (count < MAX_PREALLOC) { \ + PyList_SET_ITEM(list, count, sub); \ + } else { \ + if (PyList_Append(list, sub)) { \ + Py_DECREF(sub); \ + goto onError; \ + } \ + else \ + Py_DECREF(sub); \ + } \ + count++; } + + +/* Always force the list to the expected size. */ +#define FIX_PREALLOC_SIZE(list) Py_SET_SIZE(list, count) + +Py_LOCAL_INLINE(PyObject *) +STRINGLIB(split_whitespace)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = 0; + while (maxcount-- > 0) { + while (i < str_len && STRINGLIB_ISSPACE(str[i])) + i++; + if (i == str_len) break; + j = i; i++; + while (i < str_len && !STRINGLIB_ISSPACE(str[i])) + i++; +#if !STRINGLIB_MUTABLE + if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No whitespace in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + break; + } +#endif + SPLIT_ADD(str, j, i); + } + + if (i < str_len) { + /* Only occurs when maxcount was reached */ + /* Skip any remaining whitespace and copy to end of string */ + while (i < str_len && STRINGLIB_ISSPACE(str[i])) + i++; + if (i != str_len) + SPLIT_ADD(str, i, str_len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +STRINGLIB(split_char)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR ch, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = 0; + while ((j < str_len) && (maxcount-- > 0)) { + for(; j < str_len; j++) { + /* I found that using memchr makes no difference */ + if (str[j] == ch) { + SPLIT_ADD(str, i, j); + i = j = j + 1; + break; + } + } + } +#if !STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + if (i <= str_len) { + SPLIT_ADD(str, i, str_len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +STRINGLIB(split)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, pos, count=0; + PyObject *list, *sub; + + if (sep_len == 0) { + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + else if (sep_len == 1) + return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount); + + list = PyList_New(PREALLOC_SIZE(maxcount)); + if (list == NULL) + return NULL; + + i = j = 0; + while (maxcount-- > 0) { + pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); + if (pos < 0) + break; + j = i + pos; + SPLIT_ADD(str, i, j); + i = j + sep_len; + } +#if !STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No match in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + { + SPLIT_ADD(str, i, str_len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +STRINGLIB(rsplit_whitespace)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = str_len - 1; + while (maxcount-- > 0) { + while (i >= 0 && STRINGLIB_ISSPACE(str[i])) + i--; + if (i < 0) break; + j = i; i--; + while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) + i--; +#if !STRINGLIB_MUTABLE + if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No whitespace in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + break; + } +#endif + SPLIT_ADD(str, i + 1, j + 1); + } + + if (i >= 0) { + /* Only occurs when maxcount was reached */ + /* Skip any remaining whitespace and copy to beginning of string */ + while (i >= 0 && STRINGLIB_ISSPACE(str[i])) + i--; + if (i >= 0) + SPLIT_ADD(str, 0, i + 1); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +STRINGLIB(rsplit_char)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR ch, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = str_len - 1; + while ((i >= 0) && (maxcount-- > 0)) { + for(; i >= 0; i--) { + if (str[i] == ch) { + SPLIT_ADD(str, i + 1, j + 1); + j = i = i - 1; + break; + } + } + } +#if !STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + if (j >= -1) { + SPLIT_ADD(str, 0, j + 1); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +STRINGLIB(rsplit)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, + Py_ssize_t maxcount) +{ + Py_ssize_t j, pos, count=0; + PyObject *list, *sub; + + if (sep_len == 0) { + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + else if (sep_len == 1) + return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount); + + list = PyList_New(PREALLOC_SIZE(maxcount)); + if (list == NULL) + return NULL; + + j = str_len; + while (maxcount-- > 0) { + pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH); + if (pos < 0) + break; + SPLIT_ADD(str, pos + sep_len, j); + j = pos; + } +#if !STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No match in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + { + SPLIT_ADD(str, 0, j); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +STRINGLIB(splitlines)(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + int keepends) +{ + /* This does not use the preallocated list because splitlines is + usually run with hundreds of newlines. The overhead of + switching between PyList_SET_ITEM and append causes about a + 2-3% slowdown for that common case. A smarter implementation + could move the if check out, so the SET_ITEMs are done first + and the appends only done when the prealloc buffer is full. + That's too much work for little gain.*/ + + Py_ssize_t i; + Py_ssize_t j; + PyObject *list = PyList_New(0); + PyObject *sub; + + if (list == NULL) + return NULL; + + for (i = j = 0; i < str_len; ) { + Py_ssize_t eol; + + /* Find a line and append it */ + while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i])) + i++; + + /* Skip the line break reading CRLF as one line break */ + eol = i; + if (i < str_len) { + if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n') + i += 2; + else + i++; + if (keepends) + eol = i; + } +#if !STRINGLIB_MUTABLE + if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No linebreak in str_obj, so just use it as list[0] */ + if (PyList_Append(list, str_obj)) + goto onError; + break; + } +#endif + SPLIT_APPEND(str, j, eol); + j = i; + } + return list; + + onError: + Py_DECREF(list); + return NULL; +} + |