author      maxim-yurchuk <maxim-yurchuk@yandex-team.com>  2024-10-09 12:29:46 +0300
committer   maxim-yurchuk <maxim-yurchuk@yandex-team.com>  2024-10-09 13:14:22 +0300
commit      9731d8a4bb7ee2cc8554eaf133bb85498a4c7d80 (patch)
tree        a8fb3181d5947c0d78cf402aa56e686130179049 /contrib/python/pandas/py3
parent      a44b779cd359f06c3ebbef4ec98c6b38609d9d85 (diff)
download    ydb-9731d8a4bb7ee2cc8554eaf133bb85498a4c7d80.tar.gz
publishFullContrib: true for ydb
<HIDDEN_URL>
commit_hash:c82a80ac4594723cebf2c7387dec9c60217f603e
Diffstat (limited to 'contrib/python/pandas/py3')
12 files changed, 3311 insertions, 0 deletions
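Note: most of the files added in this diff are Tempita templates (*.pxi.in); the .pxi files that Cython actually compiles are generated from them, which is why each template header warns "DO NOT edit .pxi FILE directly". As a rough sketch of that generation step (pandas drives it from its own build scripts; the helper name and file paths below are illustrative assumptions, not part of this commit), Cython's bundled Tempita module can expand a template like so:

    # Minimal sketch: expand a pandas .pxi.in Tempita template into the
    # .pxi file that Cython compiles. Assumes Cython's vendored Tempita;
    # the helper name and file paths are illustrative only.
    from Cython import Tempita

    def process_tempita(src_path: str, out_path: str) -> None:
        with open(src_path, encoding="utf-8") as f:
            template = f.read()
        # {{py: ...}} blocks execute as Python; {{for}} / {{if}} directives
        # expand inline, stamping out one helper per dtype entry.
        rendered = Tempita.sub(template)
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(rendered)

    process_tempita("pandas/_libs/algos_common_helper.pxi.in",
                    "pandas/_libs/algos_common_helper.pxi")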
diff --git a/contrib/python/pandas/py3/.yandex_meta/yamaker.yaml b/contrib/python/pandas/py3/.yandex_meta/yamaker.yaml new file mode 100644 index 0000000000..b0e7556343 --- /dev/null +++ b/contrib/python/pandas/py3/.yandex_meta/yamaker.yaml @@ -0,0 +1,17 @@ +copy: + - LICENSES/* +exclude_from_macros: + - LICENSES/* + - symbols.cmake + - pandas/_libs/src/headers/cmath +keep: + - symbols.cmake +cython_directive: + - language_level=3 +cython_templates: + - pandas/*.pxi.in +mark_as_cython_c: + - pandas/_libs/[!w]*.pyx + - pandas/_libs/window/indexers.pyx + - pandas/_libs/writers.pyx + - pandas/io/sas/sas.pyx diff --git a/contrib/python/pandas/py3/LICENSES/HAVEN_MIT b/contrib/python/pandas/py3/LICENSES/HAVEN_MIT new file mode 100644 index 0000000000..b03d0e6406 --- /dev/null +++ b/contrib/python/pandas/py3/LICENSES/HAVEN_MIT @@ -0,0 +1,32 @@ +Based on http://opensource.org/licenses/MIT + +This is a template. Complete and ship as file LICENSE the following 2 +lines (only) + +YEAR: +COPYRIGHT HOLDER: + +and specify as + +License: MIT + file LICENSE + +Copyright (c) <YEAR>, <COPYRIGHT HOLDER> + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/contrib/python/pandas/py3/LICENSES/OTHER b/contrib/python/pandas/py3/LICENSES/OTHER new file mode 100644 index 0000000000..7446d68eb4 --- /dev/null +++ b/contrib/python/pandas/py3/LICENSES/OTHER @@ -0,0 +1,75 @@ +Bottleneck license +------------------ + +Copyright (c) 2010-2012 Archipel Asset Management AB. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +google-api-python-client license +-------------------------------- + +Copyright (C) 2012 Google Inc. +All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Pyperclip v1.3 license +---------------------- + +Copyright (c) 2010, Albert Sweigart +All rights reserved. + +BSD-style license: + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the pyperclip nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY Albert Sweigart "AS IS" AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL Albert Sweigart BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/contrib/python/pandas/py3/pandas/_libs/algos_common_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/algos_common_helper.pxi.in new file mode 100644 index 0000000000..ee815b8bbf --- /dev/null +++ b/contrib/python/pandas/py3/pandas/_libs/algos_common_helper.pxi.in @@ -0,0 +1,73 @@ +""" +Template for each `dtype` helper function using 1-d template + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +# ---------------------------------------------------------------------- +# ensure_dtype +# ---------------------------------------------------------------------- + + +def ensure_platform_int(object arr): + # GH3033, GH1392 + # platform int is the size of the int pointer, e.g. 
np.intp + if util.is_array(arr): + if (<ndarray>arr).descr.type_num == cnp.NPY_INTP: + return arr + else: + # equiv: arr.astype(np.intp) + return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_INTP) + else: + return np.array(arr, dtype=np.intp) + + +def ensure_object(object arr): + if util.is_array(arr): + if (<ndarray>arr).descr.type_num == NPY_OBJECT: + return arr + else: + # equiv: arr.astype(object) + return cnp.PyArray_Cast(<ndarray>arr, NPY_OBJECT) + else: + return np.array(arr, dtype=np.object_) + +{{py: + +# name, c_type, dtype +dtypes = [('float64', 'FLOAT64', 'float64'), + # ('float32', 'FLOAT32', 'float32'), # disabling bc unused + ('int8', 'INT8', 'int8'), + ('int16', 'INT16', 'int16'), + ('int32', 'INT32', 'int32'), + ('int64', 'INT64', 'int64'), + ('uint64', 'UINT64', 'uint64'), + # Disabling uint and complex dtypes because we do not use them + # (and compiling them increases wheel size) (except uint64) + # ('uint8', 'UINT8', 'uint8'), + # ('uint16', 'UINT16', 'uint16'), + # ('uint32', 'UINT32', 'uint32'), + # ('complex64', 'COMPLEX64', 'complex64'), + # ('complex128', 'COMPLEX128', 'complex128') +] + +def get_dispatch(dtypes): + + for name, c_type, dtype in dtypes: + yield name, c_type, dtype +}} + +{{for name, c_type, dtype in get_dispatch(dtypes)}} + + +def ensure_{{name}}(object arr): + if util.is_array(arr): + if (<ndarray>arr).descr.type_num == NPY_{{c_type}}: + return arr + else: + # equiv: arr.astype(np.{{dtype}}) + return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_{{c_type}}) + else: + return np.asarray(arr, dtype=np.{{dtype}}) + +{{endfor}} diff --git a/contrib/python/pandas/py3/pandas/_libs/algos_take_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/algos_take_helper.pxi.in new file mode 100644 index 0000000000..2a3858674a --- /dev/null +++ b/contrib/python/pandas/py3/pandas/_libs/algos_take_helper.pxi.in @@ -0,0 +1,222 @@ +""" +Template for each `dtype` helper function for take + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +# ---------------------------------------------------------------------- +# take_1d, take_2d +# ---------------------------------------------------------------------- + + +{{py: + +# c_type_in, c_type_out +dtypes = [ + ('uint8_t', 'uint8_t'), + ('uint8_t', 'object'), + ('int8_t', 'int8_t'), + ('int8_t', 'int32_t'), + ('int8_t', 'int64_t'), + ('int8_t', 'float64_t'), + ('int16_t', 'int16_t'), + ('int16_t', 'int32_t'), + ('int16_t', 'int64_t'), + ('int16_t', 'float64_t'), + ('int32_t', 'int32_t'), + ('int32_t', 'int64_t'), + ('int32_t', 'float64_t'), + ('int64_t', 'int64_t'), + ('int64_t', 'float64_t'), + ('float32_t', 'float32_t'), + ('float32_t', 'float64_t'), + ('float64_t', 'float64_t'), + ('object', 'object'), +] + + +def get_dispatch(dtypes): + + for (c_type_in, c_type_out) in dtypes: + + def get_name(dtype_name): + if dtype_name == "object": + return "object" + if dtype_name == "uint8_t": + return "bool" + return dtype_name[:-2] + + name = get_name(c_type_in) + dest = get_name(c_type_out) + + args = dict(name=name, dest=dest, c_type_in=c_type_in, + c_type_out=c_type_out) + + yield (name, dest, c_type_in, c_type_out) + +}} + + +{{for name, dest, c_type_in, c_type_out in get_dispatch(dtypes)}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +{{if c_type_in != "object"}} +def take_1d_{{name}}_{{dest}}(const {{c_type_in}}[:] values, +{{else}} +def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values, +{{endif}} + const intp_t[:] indexer, + {{c_type_out}}[:] out, + fill_value=np.nan): + + 
cdef: + Py_ssize_t i, n, idx + {{c_type_out}} fv + + n = indexer.shape[0] + + fv = fill_value + + {{if c_type_out != "object"}} + with nogil: + {{else}} + if True: + {{endif}} + for i in range(n): + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + {{if c_type_in == "uint8_t" and c_type_out == "object"}} + out[i] = True if values[idx] > 0 else False + {{else}} + out[i] = values[idx] + {{endif}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +{{if c_type_in != "object"}} +def take_2d_axis0_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values, +{{else}} +def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, +{{endif}} + ndarray[intp_t, ndim=1] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + {{c_type_out}} fv + {{if c_type_in == c_type_out != "object"}} + const {{c_type_out}} *v + {{c_type_out}} *o + {{endif}} + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + {{if c_type_in == c_type_out != "object"}} + # GH#3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof({{c_type_out}}) and + sizeof({{c_type_out}}) * n >= 256): + + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, <size_t>(sizeof({{c_type_out}}) * k)) + return + {{endif}} + + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + out[i, j] = fv + else: + for j in range(k): + {{if c_type_in == "uint8_t" and c_type_out == "object"}} + out[i, j] = True if values[idx, j] > 0 else False + {{else}} + out[i, j] = values[idx, j] + {{endif}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +{{if c_type_in != "object"}} +def take_2d_axis1_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values, +{{else}} +def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, +{{endif}} + ndarray[intp_t, ndim=1] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): + + cdef: + Py_ssize_t i, j, k, n, idx + {{c_type_out}} fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i in range(n): + for j in range(k): + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + {{if c_type_in == "uint8_t" and c_type_out == "object"}} + out[i, j] = True if values[i, idx] > 0 else False + {{else}} + out[i, j] = values[i, idx] + {{endif}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, + indexer, + ndarray[{{c_type_out}}, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[intp_t, ndim=1] idx0 = indexer[0] + ndarray[intp_t, ndim=1] idx1 = indexer[1] + {{c_type_out}} fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + out[i, j] = fv + else: + for j in range(k): + if idx1[j] == -1: + out[i, j] = fv + else: + {{if c_type_in == "uint8_t" and c_type_out == "object"}} + out[i, j] = True if values[idx, idx1[j]] > 0 else False + {{else}} + out[i, j] = values[idx, idx1[j]] + {{endif}} + +{{endfor}} diff --git a/contrib/python/pandas/py3/pandas/_libs/hashtable_class_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/hashtable_class_helper.pxi.in new file mode 100644 index 0000000000..d4d3117a32 --- /dev/null +++ b/contrib/python/pandas/py3/pandas/_libs/hashtable_class_helper.pxi.in @@ -0,0 +1,1506 @@ +""" +Template for each `dtype` helper function for 
hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + + +{{py: + +# name +complex_types = ['complex64', + 'complex128'] +}} + +{{for name in complex_types}} +cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil: + cdef kh{{name}}_t res + res.real = val.real + res.imag = val.imag + return res + +{{endfor}} + + +{{py: + + +# name +c_types = ['khcomplex128_t', + 'khcomplex64_t', + 'float64_t', + 'float32_t', + 'int64_t', + 'int32_t', + 'int16_t', + 'int8_t', + 'uint64_t', + 'uint32_t', + 'uint16_t', + 'uint8_t'] +}} + +{{for c_type in c_types}} + +cdef bint is_nan_{{c_type}}({{c_type}} val) nogil: + {{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }} + return val.real != val.real or val.imag != val.imag + {{elif c_type in {'float64_t', 'float32_t'} }} + return val != val + {{else}} + return False + {{endif}} + + +{{if c_type in {'khcomplex128_t', 'khcomplex64_t', 'float64_t', 'float32_t'} }} +# are_equivalent_{{c_type}} is cimported via khash.pxd +{{else}} +cdef bint are_equivalent_{{c_type}}({{c_type}} val1, {{c_type}} val2) nogil: + return val1 == val2 +{{endif}} + +{{endfor}} + + +{{py: + +# name +cimported_types = ['complex64', + 'complex128', + 'float32', + 'float64', + 'int8', + 'int16', + 'int32', + 'int64', + 'pymap', + 'str', + 'strbox', + 'uint8', + 'uint16', + 'uint32', + 'uint64'] +}} + +{{for name in cimported_types}} +from pandas._libs.khash cimport ( + kh_destroy_{{name}}, + kh_exist_{{name}}, + kh_get_{{name}}, + kh_init_{{name}}, + kh_put_{{name}}, + kh_resize_{{name}}, +) + +{{endfor}} + +# ---------------------------------------------------------------------- +# VectorData +# ---------------------------------------------------------------------- + +from pandas._libs.tslibs.util cimport get_c_string +from pandas._libs.missing cimport C_NA + + +{{py: + +# name, dtype, c_type +# the generated StringVector is not actually used +# but is included for completeness (rather ObjectVector is used +# for uniques in hashtables) + +dtypes = [('Complex128', 'complex128', 'khcomplex128_t'), + ('Complex64', 'complex64', 'khcomplex64_t'), + ('Float64', 'float64', 'float64_t'), + ('Float32', 'float32', 'float32_t'), + ('Int64', 'int64', 'int64_t'), + ('Int32', 'int32', 'int32_t'), + ('Int16', 'int16', 'int16_t'), + ('Int8', 'int8', 'int8_t'), + ('String', 'string', 'char *'), + ('UInt64', 'uint64', 'uint64_t'), + ('UInt32', 'uint32', 'uint32_t'), + ('UInt16', 'uint16', 'uint16_t'), + ('UInt8', 'uint8', 'uint8_t')] +}} + +{{for name, dtype, c_type in dtypes}} + + +{{if dtype != 'int64'}} +# Int64VectorData is defined in the .pxd file because it is needed (indirectly) +# by IntervalTree + +ctypedef struct {{name}}VectorData: + {{c_type}} *data + Py_ssize_t n, m + +{{endif}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void append_data_{{dtype}}({{name}}VectorData *data, + {{c_type}} x) nogil: + + data.data[data.n] = x + data.n += 1 + +{{endfor}} + +ctypedef fused vector_data: + Int64VectorData + Int32VectorData + Int16VectorData + Int8VectorData + UInt64VectorData + UInt32VectorData + UInt16VectorData + UInt8VectorData + Float64VectorData + Float32VectorData + Complex128VectorData + Complex64VectorData + StringVectorData + +cdef bint needs_resize(vector_data *data) nogil: + return data.n == data.m + +# ---------------------------------------------------------------------- +# Vector +# ---------------------------------------------------------------------- + +cdef class Vector: + # cdef readonly: + # bint external_view_exists + + def 
__cinit__(self): + self.external_view_exists = False + + +{{py: + +# name, dtype, c_type +dtypes = [('Complex128', 'complex128', 'khcomplex128_t'), + ('Complex64', 'complex64', 'khcomplex64_t'), + ('Float64', 'float64', 'float64_t'), + ('UInt64', 'uint64', 'uint64_t'), + ('Int64', 'int64', 'int64_t'), + ('Float32', 'float32', 'float32_t'), + ('UInt32', 'uint32', 'uint32_t'), + ('Int32', 'int32', 'int32_t'), + ('UInt16', 'uint16', 'uint16_t'), + ('Int16', 'int16', 'int16_t'), + ('UInt8', 'uint8', 'uint8_t'), + ('Int8', 'int8', 'int8_t')] + +}} + +{{for name, dtype, c_type in dtypes}} + +cdef class {{name}}Vector(Vector): + + # For int64 we have to put this declaration in the .pxd file; + # Int64Vector is the only one we need exposed for other cython files. + {{if dtype != 'int64'}} + cdef: + {{name}}VectorData *data + ndarray ao + {{endif}} + + def __cinit__(self): + self.data = <{{name}}VectorData *>PyMem_Malloc( + sizeof({{name}}VectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) + self.data.data = <{{c_type}}*>self.ao.data + + cdef resize(self): + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + self.ao.resize(self.data.m, refcheck=False) + self.data.data = <{{c_type}}*>self.ao.data + + def __dealloc__(self): + if self.data is not NULL: + PyMem_Free(self.data) + self.data = NULL + + def __len__(self) -> int: + return self.data.n + + cpdef ndarray to_array(self): + if self.data.m != self.data.n: + if self.external_view_exists: + # should never happen + raise ValueError("should have raised on append()") + self.ao.resize(self.data.n, refcheck=False) + self.data.m = self.data.n + self.external_view_exists = True + return self.ao + + cdef void append(self, {{c_type}} x): + + if needs_resize(self.data): + if self.external_view_exists: + raise ValueError("external reference but " + "Vector.resize() needed") + self.resize() + + append_data_{{dtype}}(self.data, x) + + cdef extend(self, const {{c_type}}[:] x): + for i in range(len(x)): + self.append(x[i]) + +{{endfor}} + +cdef class StringVector(Vector): + + cdef: + StringVectorData *data + + def __cinit__(self): + self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.data.data = <char **>malloc(self.data.m * sizeof(char *)) + if not self.data.data: + raise MemoryError() + + cdef resize(self): + cdef: + char **orig_data + Py_ssize_t i, m + + m = self.data.m + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + + orig_data = self.data.data + self.data.data = <char **>malloc(self.data.m * sizeof(char *)) + if not self.data.data: + raise MemoryError() + for i in range(m): + self.data.data[i] = orig_data[i] + + def __dealloc__(self): + if self.data is not NULL: + if self.data.data is not NULL: + free(self.data.data) + PyMem_Free(self.data) + self.data = NULL + + def __len__(self) -> int: + return self.data.n + + cpdef ndarray[object, ndim=1] to_array(self): + cdef: + ndarray ao + Py_ssize_t n + object val + + ao = np.empty(self.data.n, dtype=object) + for i in range(self.data.n): + val = self.data.data[i] + ao[i] = val + self.external_view_exists = True + self.data.m = self.data.n + return ao + + cdef void append(self, char *x): + + if needs_resize(self.data): + self.resize() + + append_data_string(self.data, x) + + cdef extend(self, ndarray[object] x): + for i in range(len(x)): + self.append(x[i]) + + +cdef class 
ObjectVector(Vector): + + cdef: + PyObject **data + Py_ssize_t n, m + ndarray ao + + def __cinit__(self): + self.n = 0 + self.m = _INIT_VEC_CAP + self.ao = np.empty(_INIT_VEC_CAP, dtype=object) + self.data = <PyObject**>self.ao.data + + def __len__(self) -> int: + return self.n + + cdef append(self, object obj): + if self.n == self.m: + if self.external_view_exists: + raise ValueError("external reference but " + "Vector.resize() needed") + self.m = max(self.m * 2, _INIT_VEC_CAP) + self.ao.resize(self.m, refcheck=False) + self.data = <PyObject**>self.ao.data + + Py_INCREF(obj) + self.data[self.n] = <PyObject*>obj + self.n += 1 + + cpdef ndarray[object, ndim=1] to_array(self): + if self.m != self.n: + if self.external_view_exists: + raise ValueError("should have raised on append()") + self.ao.resize(self.n, refcheck=False) + self.m = self.n + self.external_view_exists = True + return self.ao + + cdef extend(self, ndarray[object] x): + for i in range(len(x)): + self.append(x[i]) + +# ---------------------------------------------------------------------- +# HashTable +# ---------------------------------------------------------------------- + + +cdef class HashTable: + + pass + +{{py: + +# name, dtype, c_type, to_c_type +dtypes = [('Complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'), + ('Float64', 'float64', 'float64_t', ''), + ('UInt64', 'uint64', 'uint64_t', ''), + ('Int64', 'int64', 'int64_t', ''), + ('Complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'), + ('Float32', 'float32', 'float32_t', ''), + ('UInt32', 'uint32', 'uint32_t', ''), + ('Int32', 'int32', 'int32_t', ''), + ('UInt16', 'uint16', 'uint16_t', ''), + ('Int16', 'int16', 'int16_t', ''), + ('UInt8', 'uint8', 'uint8_t', ''), + ('Int8', 'int8', 'int8_t', '')] + +}} + + +{{for name, dtype, c_type, to_c_type in dtypes}} + +cdef class {{name}}HashTable(HashTable): + + def __cinit__(self, int64_t size_hint=1, bint uses_mask=False): + self.table = kh_init_{{dtype}}() + size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT) + kh_resize_{{dtype}}(self.table, size_hint) + + self.uses_mask = uses_mask + self.na_position = -1 + + def __len__(self) -> int: + return self.table.size + (0 if self.na_position == -1 else 1) + + def __dealloc__(self): + if self.table is not NULL: + kh_destroy_{{dtype}}(self.table) + self.table = NULL + + def __contains__(self, object key) -> bool: + # The caller is responsible to check for compatible NA values in case + # of masked arrays. + cdef: + khiter_t k + {{c_type}} ckey + + if self.uses_mask and checknull(key): + return -1 != self.na_position + + ckey = {{to_c_type}}(key) + k = kh_get_{{dtype}}(self.table, ckey) + return k != self.table.n_buckets + + def sizeof(self, deep: bool = False) -> int: + """ return the size of my table in bytes """ + overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) + for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) + for_pairs = self.table.n_buckets * (sizeof({{dtype}}_t) + # keys + sizeof(Py_ssize_t)) # vals + return overhead + for_flags + for_pairs + + def get_state(self) -> dict[str, int]: + """ returns infos about the state of the hashtable""" + return { + 'n_buckets' : self.table.n_buckets, + 'size' : self.table.size, + 'n_occupied' : self.table.n_occupied, + 'upper_bound' : self.table.upper_bound, + } + + cpdef get_item(self, {{dtype}}_t val): + """Extracts the position of val from the hashtable. 
+ + Parameters + ---------- + val : Scalar + The value that is looked up in the hashtable + + Returns + ------- + The position of the requested integer. + """ + + # Used in core.sorting, IndexEngine.get_loc + # Caller is responsible for checking for pd.NA + cdef: + khiter_t k + {{c_type}} cval + + cval = {{to_c_type}}(val) + k = kh_get_{{dtype}}(self.table, cval) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + cpdef get_na(self): + """Extracts the position of na_value from the hashtable. + + Returns + ------- + The position of the last na value. + """ + + if not self.uses_mask: + raise NotImplementedError + + if self.na_position == -1: + raise KeyError("NA") + return self.na_position + + cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val): + # Used in libjoin + # Caller is responsible for checking for pd.NA + cdef: + khiter_t k + int ret = 0 + {{c_type}} ckey + + ckey = {{to_c_type}}(key) + k = kh_put_{{dtype}}(self.table, ckey, &ret) + if kh_exist_{{dtype}}(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + cpdef set_na(self, Py_ssize_t val): + # Caller is responsible for checking for pd.NA + cdef: + khiter_t k + int ret = 0 + {{c_type}} ckey + + if not self.uses_mask: + raise NotImplementedError + + self.na_position = val + + {{if dtype == "int64" }} + # We only use this for int64, can reduce build size and make .pyi + # more accurate by only implementing it for int64 + @cython.boundscheck(False) + def map_keys_to_values( + self, const {{dtype}}_t[:] keys, const int64_t[:] values + ) -> None: + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{c_type}} key + khiter_t k + + with nogil: + for i in range(n): + key = {{to_c_type}}(keys[i]) + k = kh_put_{{dtype}}(self.table, key, &ret) + self.table.vals[k] = <Py_ssize_t>values[i] + {{endif}} + + @cython.boundscheck(False) + def map_locations(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> None: + # Used in libindex, safe_sort + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{c_type}} val + khiter_t k + int8_t na_position = self.na_position + + if self.uses_mask and mask is None: + raise NotImplementedError # pragma: no cover + + with nogil: + if self.uses_mask: + for i in range(n): + if mask[i]: + na_position = i + else: + val= {{to_c_type}}(values[i]) + k = kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = i + else: + for i in range(n): + val= {{to_c_type}}(values[i]) + k = kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = i + self.na_position = na_position + + @cython.boundscheck(False) + def lookup(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> ndarray: + # -> np.ndarray[np.intp] + # Used in safe_sort, IndexEngine.get_indexer + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{c_type}} val + khiter_t k + intp_t[::1] locs = np.empty(n, dtype=np.intp) + int8_t na_position = self.na_position + + if self.uses_mask and mask is None: + raise NotImplementedError # pragma: no cover + + with nogil: + for i in range(n): + if self.uses_mask and mask[i]: + locs[i] = na_position + else: + val = {{to_c_type}}(values[i]) + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + @cython.boundscheck(False) + @cython.wraparound(False) + def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None, bint 
ignore_na=False, + object mask=None, bint return_inverse=False, bint use_result_mask=False): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + uniques : {{name}}Vector + Vector into which uniques will be written + count_prior : Py_ssize_t, default 0 + Number of existing entries in uniques + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + ignore_na : bool, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + mask : ndarray[bool], optional + If not None, the mask is used as indicator for missing values + (True = missing, False = valid) instead of `na_value` or + condition "val != val". + return_inverse : bool, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + use_result_mask: bool, default False + Whether to create a result mask for the unique values. Not supported + with return_inverse=True. + + Returns + ------- + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[intp_t] (if return_inverse=True) + The labels from values to uniques + result_mask: ndarray[bool], if use_result_mask is true + The mask for the result values. + """ + cdef: + Py_ssize_t i, idx, count = count_prior, n = len(values) + intp_t[::1] labels + int ret = 0 + {{c_type}} val, na_value2 + khiter_t k + {{name}}VectorData *ud + UInt8Vector result_mask + UInt8VectorData *rmd + bint use_na_value, use_mask, seen_na = False + uint8_t[:] mask_values + + if return_inverse: + labels = np.empty(n, dtype=np.intp) + ud = uniques.data + use_na_value = na_value is not None + use_mask = mask is not None + if not use_mask and use_result_mask: + raise NotImplementedError # pragma: no cover + + if use_result_mask and return_inverse: + raise NotImplementedError # pragma: no cover + + result_mask = UInt8Vector() + rmd = result_mask.data + + if use_mask: + mask_values = mask.view("uint8") + + if use_na_value: + # We need this na_value2 because we want to allow users + # to *optionally* specify an NA sentinel *of the correct* type. + # We use None, to make it optional, which requires `object` type + # for the parameter. To please the compiler, we use na_value2, + # which is only used if it's *specified*. + na_value2 = {{to_c_type}}(na_value) + else: + na_value2 = {{to_c_type}}(0) + + with nogil: + for i in range(n): + val = {{to_c_type}}(values[i]) + + if ignore_na and use_mask: + if mask_values[i]: + labels[i] = na_sentinel + continue + elif ignore_na and ( + is_nan_{{c_type}}(val) or + (use_na_value and are_equivalent_{{c_type}}(val, na_value2)) + ): + # if missing values do not count as unique values (i.e. 
if + # ignore_na is True), skip the hashtable entry for them, + # and replace the corresponding label with na_sentinel + labels[i] = na_sentinel + continue + elif not ignore_na and use_result_mask: + if mask_values[i]: + if seen_na: + continue + + seen_na = True + if needs_resize(ud): + with gil: + if uniques.external_view_exists: + raise ValueError("external reference to " + "uniques held, but " + "Vector.resize() needed") + uniques.resize() + if result_mask.external_view_exists: + raise ValueError("external reference to " + "result_mask held, but " + "Vector.resize() needed") + result_mask.resize() + append_data_{{dtype}}(ud, val) + append_data_uint8(rmd, 1) + continue + + k = kh_get_{{dtype}}(self.table, val) + + if k == self.table.n_buckets: + # k hasn't been seen yet + k = kh_put_{{dtype}}(self.table, val, &ret) + + if needs_resize(ud): + with gil: + if uniques.external_view_exists: + raise ValueError("external reference to " + "uniques held, but " + "Vector.resize() needed") + uniques.resize() + if use_result_mask: + if result_mask.external_view_exists: + raise ValueError("external reference to " + "result_mask held, but " + "Vector.resize() needed") + result_mask.resize() + append_data_{{dtype}}(ud, val) + if use_result_mask: + append_data_uint8(rmd, 0) + + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + elif return_inverse: + # k falls into a previous bucket + # only relevant in case we need to construct the inverse + idx = self.table.vals[k] + labels[i] = idx + + if return_inverse: + return uniques.to_array(), labels.base # .base -> underlying ndarray + if use_result_mask: + return uniques.to_array(), result_mask.to_array() + return uniques.to_array() + + def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + return_inverse : bool, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + mask : ndarray[bool], optional + If not None, the mask is used as indicator for missing values + (True = missing, False = valid) instead of `na_value` or + + Returns + ------- + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[intp_t] (if return_inverse) + The labels from values to uniques + result_mask: ndarray[bool], if mask is given as input + The mask for the result values. + """ + uniques = {{name}}Vector() + use_result_mask = True if mask is not None else False + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask) + + def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, + object na_value=None, object mask=None, ignore_na=True): + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then + any value "val" satisfying val != val is considered missing. 
+ If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + mask : ndarray[bool], optional + If not None, the mask is used as indicator for missing values + (True = missing, False = valid) instead of `na_value` or + condition "val != val". + + Returns + ------- + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[intp_t] + The labels from values to uniques + """ + uniques_vector = {{name}}Vector() + return self._unique(values, uniques_vector, na_sentinel=na_sentinel, + na_value=na_value, ignore_na=ignore_na, mask=mask, + return_inverse=True) + + def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None, object mask=None): + # -> np.ndarray[np.intp] + _, labels = self._unique(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value, + ignore_na=True, return_inverse=True, mask=mask) + return labels + + {{if dtype == 'int64'}} + @cython.boundscheck(False) + def get_labels_groupby( + self, const {{dtype}}_t[:] values + ) -> tuple[ndarray, ndarray]: + # tuple[np.ndarray[np.intp], np.ndarray[{{dtype}}]] + cdef: + Py_ssize_t i, n = len(values) + intp_t[::1] labels + Py_ssize_t idx, count = 0 + int ret = 0 + {{c_type}} val + khiter_t k + {{name}}Vector uniques = {{name}}Vector() + {{name}}VectorData *ud + + labels = np.empty(n, dtype=np.intp) + ud = uniques.data + + with nogil: + for i in range(n): + val = {{to_c_type}}(values[i]) + + # specific for groupby + if val < 0: + labels[i] = -1 + continue + + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + labels[i] = count + count += 1 + + arr_uniques = uniques.to_array() + + return np.asarray(labels), arr_uniques + {{endif}} + + +cdef class {{name}}Factorizer(Factorizer): + cdef public: + {{name}}HashTable table + {{name}}Vector uniques + + def __cinit__(self, size_hint: int): + self.table = {{name}}HashTable(size_hint) + self.uniques = {{name}}Vector() + + def factorize(self, const {{c_type}}[:] values, + na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray: + """ + Returns + ------- + ndarray[intp_t] + + Examples + -------- + Factorize values with nans replaced by na_sentinel + + >>> fac = {{name}}Factorizer(3) + >>> fac.factorize(np.array([1,2,3], dtype="{{dtype}}"), na_sentinel=20) + array([0, 1, 2]) + """ + cdef: + ndarray[intp_t] labels + + if self.uniques.external_view_exists: + uniques = {{name}}Vector() + uniques.extend(self.uniques.to_array()) + self.uniques = uniques + labels = self.table.get_labels(values, self.uniques, + self.count, na_sentinel, + na_value=na_value, mask=mask) + self.count = len(self.uniques) + return labels + +{{endfor}} + + +cdef class StringHashTable(HashTable): + # these by-definition *must* be strings + # or a sentinel np.nan / None missing value + na_string_sentinel = '__nan__' + + def __init__(self, int64_t size_hint=1): + self.table = kh_init_str() + size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT) + kh_resize_str(self.table, size_hint) + + def __dealloc__(self): + if self.table is not NULL: + kh_destroy_str(self.table) + self.table = NULL + + def sizeof(self, deep: bool = False) -> int: + overhead = 4 * sizeof(uint32_t) + 3 * 
sizeof(uint32_t*) + for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) + for_pairs = self.table.n_buckets * (sizeof(char *) + # keys + sizeof(Py_ssize_t)) # vals + return overhead + for_flags + for_pairs + + def get_state(self) -> dict[str, int]: + """ returns infos about the state of the hashtable""" + return { + 'n_buckets' : self.table.n_buckets, + 'size' : self.table.size, + 'n_occupied' : self.table.n_occupied, + 'upper_bound' : self.table.upper_bound, + } + + cpdef get_item(self, str val): + cdef: + khiter_t k + const char *v + v = get_c_string(val) + + k = kh_get_str(self.table, v) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + cpdef set_item(self, str key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + const char *v + + v = get_c_string(key) + + k = kh_put_str(self.table, v, &ret) + if kh_exist_str(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + @cython.boundscheck(False) + def get_indexer(self, ndarray[object] values) -> ndarray: + # -> np.ndarray[np.intp] + cdef: + Py_ssize_t i, n = len(values) + ndarray[intp_t] labels = np.empty(n, dtype=np.intp) + intp_t *resbuf = <intp_t*>labels.data + khiter_t k + kh_str_t *table = self.table + const char *v + const char **vecs + + vecs = <const char **>malloc(n * sizeof(char *)) + for i in range(n): + val = values[i] + v = get_c_string(val) + vecs[i] = v + + with nogil: + for i in range(n): + k = kh_get_str(table, vecs[i]) + if k != table.n_buckets: + resbuf[i] = table.vals[k] + else: + resbuf[i] = -1 + + free(vecs) + return labels + + @cython.boundscheck(False) + def lookup(self, ndarray[object] values, object mask = None) -> ndarray: + # -> np.ndarray[np.intp] + # mask not yet implemented + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + const char *v + khiter_t k + intp_t[::1] locs = np.empty(n, dtype=np.intp) + + # these by-definition *must* be strings + vecs = <const char **>malloc(n * sizeof(char *)) + for i in range(n): + val = values[i] + + if isinstance(val, str): + # GH#31499 if we have a np.str_ get_c_string won't recognize + # it as a str, even though isinstance does. + v = get_c_string(<str>val) + else: + v = get_c_string(self.na_string_sentinel) + vecs[i] = v + + with nogil: + for i in range(n): + v = vecs[i] + k = kh_get_str(self.table, v) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + free(vecs) + return np.asarray(locs) + + @cython.boundscheck(False) + def map_locations(self, ndarray[object] values, object mask = None) -> None: + # mask not yet implemented + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + const char *v + const char **vecs + khiter_t k + + # these by-definition *must* be strings + vecs = <const char **>malloc(n * sizeof(char *)) + for i in range(n): + val = values[i] + + if isinstance(val, str): + # GH#31499 if we have a np.str_ get_c_string won't recognize + # it as a str, even though isinstance does. + v = get_c_string(<str>val) + else: + v = get_c_string(self.na_string_sentinel) + vecs[i] = v + + with nogil: + for i in range(n): + v = vecs[i] + k = kh_put_str(self.table, v, &ret) + self.table.vals[k] = i + free(vecs) + + @cython.boundscheck(False) + @cython.wraparound(False) + def _unique(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None, bint ignore_na=False, + bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) 
+ + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + uniques : ObjectVector + Vector into which uniques will be written + count_prior : Py_ssize_t, default 0 + Number of existing entries in uniques + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then any value + that is not a string is considered missing. If na_value is + not None, then _additionally_ any value "val" satisfying + val == na_value is considered missing. + ignore_na : bool, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : bool, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[intp_t] (if return_inverse=True) + The labels from values to uniques + """ + cdef: + Py_ssize_t i, idx, count = count_prior, n = len(values) + intp_t[::1] labels + int64_t[::1] uindexer + int ret = 0 + object val + const char *v + const char **vecs + khiter_t k + bint use_na_value + + if return_inverse: + labels = np.zeros(n, dtype=np.intp) + uindexer = np.empty(n, dtype=np.int64) + use_na_value = na_value is not None + + # assign pointers and pre-filter out missing (if ignore_na) + vecs = <const char **>malloc(n * sizeof(char *)) + for i in range(n): + val = values[i] + + if (ignore_na + and (not isinstance(val, str) + or (use_na_value and val == na_value))): + # if missing values do not count as unique values (i.e. if + # ignore_na is True), we can skip the actual value, and + # replace the label with na_sentinel directly + labels[i] = na_sentinel + else: + # if ignore_na is False, we also stringify NaN/None/etc. + try: + v = get_c_string(<str>val) + except UnicodeEncodeError: + v = get_c_string(<str>repr(val)) + vecs[i] = v + + # compute + with nogil: + for i in range(n): + if ignore_na and labels[i] == na_sentinel: + # skip entries for ignored missing values (see above) + continue + + v = vecs[i] + k = kh_get_str(self.table, v) + if k == self.table.n_buckets: + # k hasn't been seen yet + k = kh_put_str(self.table, v, &ret) + uindexer[count] = i + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + elif return_inverse: + # k falls into a previous bucket + # only relevant in case we need to construct the inverse + idx = self.table.vals[k] + labels[i] = idx + + free(vecs) + + # uniques + for i in range(count): + uniques.append(values[uindexer[i]]) + + if return_inverse: + return uniques.to_array(), labels.base # .base -> underlying ndarray + return uniques.to_array() + + def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + return_inverse : bool, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. 
+ mask : ndarray[bool], optional + Not yet implemented for StringHashTable + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[intp_t] (if return_inverse) + The labels from values to uniques + """ + uniques = ObjectVector() + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) + + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, + object na_value=None, object mask=None, ignore_na=True): + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then any value + that is not a string is considered missing. If na_value is + not None, then _additionally_ any value "val" satisfying + val == na_value is considered missing. + mask : ndarray[bool], optional + Not yet implemented for StringHashTable. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[intp] + The labels from values to uniques + """ + uniques_vector = ObjectVector() + return self._unique(values, uniques_vector, na_sentinel=na_sentinel, + na_value=na_value, ignore_na=ignore_na, + return_inverse=True) + + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): + # -> np.ndarray[np.intp] + _, labels = self._unique(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value, + ignore_na=True, return_inverse=True) + return labels + + +cdef class PyObjectHashTable(HashTable): + + def __init__(self, int64_t size_hint=1): + self.table = kh_init_pymap() + size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT) + kh_resize_pymap(self.table, size_hint) + + def __dealloc__(self): + if self.table is not NULL: + kh_destroy_pymap(self.table) + self.table = NULL + + def __len__(self) -> int: + return self.table.size + + def __contains__(self, object key) -> bool: + cdef: + khiter_t k + hash(key) + + k = kh_get_pymap(self.table, <PyObject*>key) + return k != self.table.n_buckets + + def sizeof(self, deep: bool = False) -> int: + """ return the size of my table in bytes """ + overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) + for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) + for_pairs = self.table.n_buckets * (sizeof(PyObject *) + # keys + sizeof(Py_ssize_t)) # vals + return overhead + for_flags + for_pairs + + def get_state(self) -> dict[str, int]: + """ + returns infos about the current state of the hashtable like size, + number of buckets and so on. 
+ """ + return { + 'n_buckets' : self.table.n_buckets, + 'size' : self.table.size, + 'n_occupied' : self.table.n_occupied, + 'upper_bound' : self.table.upper_bound, + } + + cpdef get_item(self, object val): + cdef: + khiter_t k + + k = kh_get_pymap(self.table, <PyObject*>val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + hash(key) + + k = kh_put_pymap(self.table, <PyObject*>key, &ret) + if kh_exist_pymap(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def map_locations(self, ndarray[object] values, object mask = None) -> None: + # mask not yet implemented + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + + for i in range(n): + val = values[i] + hash(val) + + k = kh_put_pymap(self.table, <PyObject*>val, &ret) + self.table.vals[k] = i + + def lookup(self, ndarray[object] values, object mask = None) -> ndarray: + # -> np.ndarray[np.intp] + # mask not yet implemented + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + intp_t[::1] locs = np.empty(n, dtype=np.intp) + + for i in range(n): + val = values[i] + hash(val) + + k = kh_get_pymap(self.table, <PyObject*>val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + @cython.boundscheck(False) + @cython.wraparound(False) + def _unique(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None, bint ignore_na=False, + bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + uniques : ObjectVector + Vector into which uniques will be written + count_prior : Py_ssize_t, default 0 + Number of existing entries in uniques + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then None _plus_ + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + ignore_na : bool, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : bool, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[intp_t] (if return_inverse=True) + The labels from values to uniques + """ + cdef: + Py_ssize_t i, idx, count = count_prior, n = len(values) + intp_t[::1] labels + int ret = 0 + object val + khiter_t k + bint use_na_value + + if return_inverse: + labels = np.empty(n, dtype=np.intp) + use_na_value = na_value is not None + + for i in range(n): + val = values[i] + hash(val) + + if ignore_na and ( + checknull(val) + or (use_na_value and val == na_value) + ): + # if missing values do not count as unique values (i.e. 
if + # ignore_na is True), skip the hashtable entry for them, and + # replace the corresponding label with na_sentinel + labels[i] = na_sentinel + continue + + k = kh_get_pymap(self.table, <PyObject*>val) + if k == self.table.n_buckets: + # k hasn't been seen yet + k = kh_put_pymap(self.table, <PyObject*>val, &ret) + uniques.append(val) + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + elif return_inverse: + # k falls into a previous bucket + # only relevant in case we need to construct the inverse + idx = self.table.vals[k] + labels[i] = idx + + if return_inverse: + return uniques.to_array(), labels.base # .base -> underlying ndarray + return uniques.to_array() + + def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + return_inverse : bool, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + mask : ndarray[bool], optional + Not yet implemented for PyObjectHashTable + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[intp_t] (if return_inverse) + The labels from values to uniques + """ + uniques = ObjectVector() + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) + + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, + object na_value=None, object mask=None, ignore_na=True): + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then None _plus_ + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + mask : ndarray[bool], optional + Not yet implemented for PyObjectHashTable. 
+ + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[intp_t] + The labels from values to uniques + """ + uniques_vector = ObjectVector() + return self._unique(values, uniques_vector, na_sentinel=na_sentinel, + na_value=na_value, ignore_na=ignore_na, + return_inverse=True) + + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): + # -> np.ndarray[np.intp] + _, labels = self._unique(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value, + ignore_na=True, return_inverse=True) + return labels diff --git a/contrib/python/pandas/py3/pandas/_libs/hashtable_func_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/hashtable_func_helper.pxi.in new file mode 100644 index 0000000000..b9cf601148 --- /dev/null +++ b/contrib/python/pandas/py3/pandas/_libs/hashtable_func_helper.pxi.in @@ -0,0 +1,484 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +{{py: + +# name, dtype, ttype, c_type, to_c_type +dtypes = [('Complex128', 'complex128', 'complex128', + 'khcomplex128_t', 'to_khcomplex128_t'), + ('Complex64', 'complex64', 'complex64', + 'khcomplex64_t', 'to_khcomplex64_t'), + ('Float64', 'float64', 'float64', 'float64_t', ''), + ('Float32', 'float32', 'float32', 'float32_t', ''), + ('UInt64', 'uint64', 'uint64', 'uint64_t', ''), + ('UInt32', 'uint32', 'uint32', 'uint32_t', ''), + ('UInt16', 'uint16', 'uint16', 'uint16_t', ''), + ('UInt8', 'uint8', 'uint8', 'uint8_t', ''), + ('Object', 'object', 'pymap', 'object', '<PyObject*>'), + ('Int64', 'int64', 'int64', 'int64_t', ''), + ('Int32', 'int32', 'int32', 'int32_t', ''), + ('Int16', 'int16', 'int16', 'int16_t', ''), + ('Int8', 'int8', 'int8', 'int8_t', '')] + +}} + +{{for name, dtype, ttype, c_type, to_c_type in dtypes}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +{{if dtype == 'object'}} +cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t[:] mask=None): +{{else}} +cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None): +{{endif}} + cdef: + Py_ssize_t i = 0 + Py_ssize_t n = len(values) + kh_{{ttype}}_t *table + + # Don't use Py_ssize_t, since table.n_buckets is unsigned + khiter_t k + + {{c_type}} val + + int ret = 0 + bint uses_mask = mask is not None + bint isna_entry = False + + if uses_mask and not dropna: + raise NotImplementedError("uses_mask not implemented with dropna=False") + + # we track the order in which keys are first seen (GH39009), + # khash-map isn't insertion-ordered, thus: + # table maps keys to counts + # result_keys remembers the original order of keys + + result_keys = {{name}}Vector() + table = kh_init_{{ttype}}() + + {{if dtype == 'object'}} + if uses_mask: + raise NotImplementedError("uses_mask not implemented with object dtype") + + kh_resize_{{ttype}}(table, n // 10) + + for i in range(n): + val = values[i] + if not dropna or not checknull(val): + k = kh_get_{{ttype}}(table, {{to_c_type}}val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_{{ttype}}(table, {{to_c_type}}val, &ret) + table.vals[k] = 1 + result_keys.append(val) + {{else}} + kh_resize_{{ttype}}(table, n) + + for i in range(n): + val = {{to_c_type}}(values[i]) + + if dropna: + if uses_mask: + isna_entry = mask[i] + else: + isna_entry = is_nan_{{c_type}}(val) + + if not dropna or not 
isna_entry: + k = kh_get_{{ttype}}(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_{{ttype}}(table, val, &ret) + table.vals[k] = 1 + result_keys.append(val) + {{endif}} + + # collect counts in the order corresponding to result_keys: + cdef: + int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64) + + for i in range(table.size): + {{if dtype == 'object'}} + k = kh_get_{{ttype}}(table, result_keys.data[i]) + {{else}} + k = kh_get_{{ttype}}(table, result_keys.data.data[i]) + {{endif}} + result_counts[i] = table.vals[k] + + kh_destroy_{{ttype}}(table) + + return result_keys.to_array(), result_counts.base + + +@cython.wraparound(False) +@cython.boundscheck(False) +{{if dtype == 'object'}} +cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None): +{{else}} +cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None): +{{endif}} + cdef: + int ret = 0 + {{if dtype != 'object'}} + {{c_type}} value + {{else}} + PyObject* value + {{endif}} + Py_ssize_t i, n = len(values), first_na = -1 + khiter_t k + kh_{{ttype}}_t *table = kh_init_{{ttype}}() + ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') + bint seen_na = False, uses_mask = mask is not None + bint seen_multiple_na = False + + kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) + + if keep not in ('last', 'first', False): + raise ValueError('keep must be either "first", "last" or False') + + {{for cond, keep in [('if', '"last"'), ('elif', '"first"')]}} + {{cond}} keep == {{keep}}: + {{if dtype == 'object'}} + if True: + {{else}} + with nogil: + {{endif}} + {{if keep == '"last"'}} + for i in range(n - 1, -1, -1): + {{else}} + for i in range(n): + {{endif}} + if uses_mask and mask[i]: + if seen_na: + out[i] = True + else: + out[i] = False + seen_na = True + else: + value = {{to_c_type}}(values[i]) + kh_put_{{ttype}}(table, value, &ret) + out[i] = ret == 0 + {{endfor}} + + else: + {{if dtype == 'object'}} + if True: + {{else}} + with nogil: + {{endif}} + for i in range(n): + if uses_mask and mask[i]: + if not seen_na: + first_na = i + seen_na = True + out[i] = 0 + elif not seen_multiple_na: + out[i] = 1 + out[first_na] = 1 + seen_multiple_na = True + else: + out[i] = 1 + + else: + value = {{to_c_type}}(values[i]) + k = kh_get_{{ttype}}(table, value) + if k != table.n_buckets: + out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_{{ttype}}(table, value, &ret) + table.vals[k] = i + out[i] = 0 + + kh_destroy_{{ttype}}(table) + return out + + +# ---------------------------------------------------------------------- +# Membership +# ---------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +{{if dtype == 'object'}} +cdef ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values): +{{else}} +cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): +{{endif}} + """ + Return boolean of values in arr on an + element by-element basis + + Parameters + ---------- + arr : {{dtype}} ndarray + values : {{dtype}} ndarray + + Returns + ------- + boolean ndarray len of (arr) + """ + cdef: + Py_ssize_t i, n + khiter_t k + int ret = 0 + ndarray[uint8_t] result + + {{if dtype == "object"}} + PyObject* val + {{else}} + {{c_type}} val + {{endif}} + + kh_{{ttype}}_t *table = kh_init_{{ttype}}() + + # construct the table + n = len(values) + kh_resize_{{ttype}}(table, n) + + {{if dtype == 
'object'}} + if True: + {{else}} + with nogil: + {{endif}} + for i in range(n): + val = {{to_c_type}}(values[i]) + kh_put_{{ttype}}(table, val, &ret) + + # test membership + n = len(arr) + result = np.empty(n, dtype=np.uint8) + + {{if dtype == 'object'}} + if True: + {{else}} + with nogil: + {{endif}} + for i in range(n): + val = {{to_c_type}}(arr[i]) + k = kh_get_{{ttype}}(table, val) + result[i] = (k != table.n_buckets) + + kh_destroy_{{ttype}}(table) + return result.view(np.bool_) + +# ---------------------------------------------------------------------- +# Mode Computations +# ---------------------------------------------------------------------- + +{{endfor}} + + +ctypedef fused htfunc_t: + numeric_object_t + complex128_t + complex64_t + + +cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): + if htfunc_t is object: + return value_count_object(values, dropna, mask=mask) + + elif htfunc_t is int8_t: + return value_count_int8(values, dropna, mask=mask) + elif htfunc_t is int16_t: + return value_count_int16(values, dropna, mask=mask) + elif htfunc_t is int32_t: + return value_count_int32(values, dropna, mask=mask) + elif htfunc_t is int64_t: + return value_count_int64(values, dropna, mask=mask) + + elif htfunc_t is uint8_t: + return value_count_uint8(values, dropna, mask=mask) + elif htfunc_t is uint16_t: + return value_count_uint16(values, dropna, mask=mask) + elif htfunc_t is uint32_t: + return value_count_uint32(values, dropna, mask=mask) + elif htfunc_t is uint64_t: + return value_count_uint64(values, dropna, mask=mask) + + elif htfunc_t is float64_t: + return value_count_float64(values, dropna, mask=mask) + elif htfunc_t is float32_t: + return value_count_float32(values, dropna, mask=mask) + + elif htfunc_t is complex128_t: + return value_count_complex128(values, dropna, mask=mask) + elif htfunc_t is complex64_t: + return value_count_complex64(values, dropna, mask=mask) + + else: + raise TypeError(values.dtype) + + +cpdef duplicated(ndarray[htfunc_t] values, object keep="first", const uint8_t[:] mask=None): + if htfunc_t is object: + return duplicated_object(values, keep, mask=mask) + + elif htfunc_t is int8_t: + return duplicated_int8(values, keep, mask=mask) + elif htfunc_t is int16_t: + return duplicated_int16(values, keep, mask=mask) + elif htfunc_t is int32_t: + return duplicated_int32(values, keep, mask=mask) + elif htfunc_t is int64_t: + return duplicated_int64(values, keep, mask=mask) + + elif htfunc_t is uint8_t: + return duplicated_uint8(values, keep, mask=mask) + elif htfunc_t is uint16_t: + return duplicated_uint16(values, keep, mask=mask) + elif htfunc_t is uint32_t: + return duplicated_uint32(values, keep, mask=mask) + elif htfunc_t is uint64_t: + return duplicated_uint64(values, keep, mask=mask) + + elif htfunc_t is float64_t: + return duplicated_float64(values, keep, mask=mask) + elif htfunc_t is float32_t: + return duplicated_float32(values, keep, mask=mask) + + elif htfunc_t is complex128_t: + return duplicated_complex128(values, keep, mask=mask) + elif htfunc_t is complex64_t: + return duplicated_complex64(values, keep, mask=mask) + + else: + raise TypeError(values.dtype) + + +cpdef ismember(ndarray[htfunc_t] arr, ndarray[htfunc_t] values): + if htfunc_t is object: + return ismember_object(arr, values) + + elif htfunc_t is int8_t: + return ismember_int8(arr, values) + elif htfunc_t is int16_t: + return ismember_int16(arr, values) + elif htfunc_t is int32_t: + return ismember_int32(arr, values) + elif htfunc_t is int64_t: + return 
ismember_int64(arr, values) + + elif htfunc_t is uint8_t: + return ismember_uint8(arr, values) + elif htfunc_t is uint16_t: + return ismember_uint16(arr, values) + elif htfunc_t is uint32_t: + return ismember_uint32(arr, values) + elif htfunc_t is uint64_t: + return ismember_uint64(arr, values) + + elif htfunc_t is float64_t: + return ismember_float64(arr, values) + elif htfunc_t is float32_t: + return ismember_float32(arr, values) + + elif htfunc_t is complex128_t: + return ismember_complex128(arr, values) + elif htfunc_t is complex64_t: + return ismember_complex64(arr, values) + + else: + raise TypeError(values.dtype) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): + # TODO(cython3): use const htfunct_t[:] + + cdef: + ndarray[htfunc_t] keys + ndarray[htfunc_t] modes + + int64_t[::1] counts + int64_t count, max_count = -1 + Py_ssize_t nkeys, k, j = 0 + + keys, counts = value_count(values, dropna, mask=mask) + nkeys = len(keys) + + modes = np.empty(nkeys, dtype=values.dtype) + + if htfunc_t is not object: + with nogil: + for k in range(nkeys): + count = counts[k] + if count == max_count: + j += 1 + elif count > max_count: + max_count = count + j = 0 + else: + continue + + modes[j] = keys[k] + else: + for k in range(nkeys): + count = counts[k] + if count == max_count: + j += 1 + elif count > max_count: + max_count = count + j = 0 + else: + continue + + modes[j] = keys[k] + + return modes[:j + 1] + + +{{py: + +# name, dtype, ttype, c_type +dtypes = [('Int64', 'int64', 'int64', 'int64_t'), + ('Int32', 'int32', 'int32', 'int32_t'), ] + +}} + +{{for name, dtype, ttype, c_type in dtypes}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: + """ + Indices of the first occurrences of the unique labels + *excluding* -1. equivalent to: + np.unique(labels, return_index=True)[1] + """ + cdef: + int ret = 0 + Py_ssize_t i, n = len(labels) + kh_{{ttype}}_t *table = kh_init_{{ttype}}() + {{name}}Vector idx = {{name}}Vector() + ndarray[{{c_type}}, ndim=1] arr + {{name}}VectorData *ud = idx.data + + kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) + + with nogil: + for i in range(n): + kh_put_{{ttype}}(table, labels[i], &ret) + if ret != 0: + if needs_resize(ud): + with gil: + idx.resize() + append_data_{{ttype}}(ud, i) + + kh_destroy_{{ttype}}(table) + + arr = idx.to_array() + arr = arr[np.asarray(labels)[arr].argsort()] + + return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr + +{{endfor}} diff --git a/contrib/python/pandas/py3/pandas/_libs/index_class_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/index_class_helper.pxi.in new file mode 100644 index 0000000000..bf3d88edd9 --- /dev/null +++ b/contrib/python/pandas/py3/pandas/_libs/index_class_helper.pxi.in @@ -0,0 +1,78 @@ +""" +Template for functions of IndexEngine subclasses. 
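+
+For each numeric dtype below, the template expands into a plain Engine
+backed by an unmasked hash table and a Masked variant constructed with
+uses_mask=True, each with a dtype-appropriate _check_type.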
+ +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +# ---------------------------------------------------------------------- +# IndexEngine Subclass Methods +# ---------------------------------------------------------------------- + +{{py: + +# name, dtype +dtypes = [('Float64', 'float64'), + ('Float32', 'float32'), + ('Int64', 'int64'), + ('Int32', 'int32'), + ('Int16', 'int16'), + ('Int8', 'int8'), + ('UInt64', 'uint64'), + ('UInt32', 'uint32'), + ('UInt16', 'uint16'), + ('UInt8', 'uint8'), + ('Complex64', 'complex64'), + ('Complex128', 'complex128'), + ] + +engines = [('', 'IndexEngine'), ('Masked', 'MaskedIndexEngine')] + +}} + +{{for name, dtype in dtypes}} + +{{for prefix, engine in engines}} + +cdef class {{prefix}}{{name}}Engine({{engine}}): + + cdef _make_hash_table(self, Py_ssize_t n): + {{if engine == 'MaskedIndexEngine'}} + return _hash.{{name}}HashTable(n, uses_mask=True) + {{else}} + return _hash.{{name}}HashTable(n) + {{endif}} + + cdef _check_type(self, object val): + {{if engine == 'MaskedIndexEngine'}} + if val is C_NA: + return val + {{endif}} + {{if name not in {'Float64', 'Float32', 'Complex64', 'Complex128'} }} + if not util.is_integer_object(val): + if util.is_float_object(val): + # Make sure Int64Index.get_loc(2.0) works + if val.is_integer(): + return int(val) + raise KeyError(val) + {{if name.startswith("U")}} + if val < 0: + # cannot have negative values with unsigned int dtype + raise KeyError(val) + {{endif}} + {{elif name not in {'Complex64', 'Complex128'} }} + if not util.is_integer_object(val) and not util.is_float_object(val): + # in particular catch bool and avoid casting True -> 1.0 + raise KeyError(val) + {{else}} + if (not util.is_integer_object(val) + and not util.is_float_object(val) + and not util.is_complex_object(val) + ): + # in particular catch bool and avoid casting True -> 1.0 + raise KeyError(val) + {{endif}} + return val + +{{endfor}} + +{{endfor}} diff --git a/contrib/python/pandas/py3/pandas/_libs/intervaltree.pxi.in b/contrib/python/pandas/py3/pandas/_libs/intervaltree.pxi.in new file mode 100644 index 0000000000..67fee7c5fb --- /dev/null +++ b/contrib/python/pandas/py3/pandas/_libs/intervaltree.pxi.in @@ -0,0 +1,434 @@ +""" +Template for intervaltree + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +from pandas._libs.algos import is_monotonic + +ctypedef fused int_scalar_t: + int64_t + float64_t + +ctypedef fused uint_scalar_t: + uint64_t + float64_t + +ctypedef fused scalar_t: + int_scalar_t + uint_scalar_t + +# ---------------------------------------------------------------------- +# IntervalTree +# ---------------------------------------------------------------------- + +cdef class IntervalTree(IntervalMixin): + """A centered interval tree + + Based off the algorithm described on Wikipedia: + https://en.wikipedia.org/wiki/Interval_tree + + we are emulating the IndexEngine interface + """ + cdef readonly: + ndarray left, right + IntervalNode root + object dtype + str closed + object _is_overlapping, _left_sorter, _right_sorter + Py_ssize_t _na_count + + def __init__(self, left, right, closed='right', leaf_size=100): + """ + Parameters + ---------- + left, right : np.ndarray[ndim=1] + Left and right bounds for each interval. Assumed to contain no + NaNs. + closed : {'left', 'right', 'both', 'neither'}, optional + Whether the intervals are closed on the left-side, right-side, both + or neither. Defaults to 'right'. 
+ leaf_size : int, optional + Parameter that controls when the tree switches from creating nodes + to brute-force search. Tune this parameter to optimize query + performance. + """ + if closed not in ['left', 'right', 'both', 'neither']: + raise ValueError("invalid option for 'closed': %s" % closed) + + left = np.asarray(left) + right = np.asarray(right) + self.dtype = np.result_type(left, right) + self.left = np.asarray(left, dtype=self.dtype) + self.right = np.asarray(right, dtype=self.dtype) + + indices = np.arange(len(left), dtype='int64') + + self.closed = closed + + # GH 23352: ensure no nan in nodes + mask = ~np.isnan(self.left) + self._na_count = len(mask) - mask.sum() + self.left = self.left[mask] + self.right = self.right[mask] + indices = indices[mask] + + node_cls = NODE_CLASSES[str(self.dtype), closed] + self.root = node_cls(self.left, self.right, indices, leaf_size) + + @property + def left_sorter(self) -> np.ndarray: + """How to sort the left labels; this is used for binary search + """ + if self._left_sorter is None: + values = [self.right, self.left] + self._left_sorter = np.lexsort(values) + return self._left_sorter + + @property + def right_sorter(self) -> np.ndarray: + """How to sort the right labels + """ + if self._right_sorter is None: + self._right_sorter = np.argsort(self.right) + return self._right_sorter + + @property + def is_overlapping(self) -> bool: + """ + Determine if the IntervalTree contains overlapping intervals. + Cached as self._is_overlapping. + """ + if self._is_overlapping is not None: + return self._is_overlapping + + # <= when both sides closed since endpoints can overlap + op = le if self.closed == 'both' else lt + + # overlap if start of current interval < end of previous interval + # (current and previous in terms of sorted order by left/start side) + current = self.left[self.left_sorter[1:]] + previous = self.right[self.left_sorter[:-1]] + self._is_overlapping = bool(op(current, previous).any()) + + return self._is_overlapping + + @property + def is_monotonic_increasing(self) -> bool: + """ + Return True if the IntervalTree is monotonic increasing (only equal or + increasing values), else False + """ + if self._na_count > 0: + return False + + sort_order = self.left_sorter + return is_monotonic(sort_order, False)[0] + + def get_indexer(self, scalar_t[:] target) -> np.ndarray: + """Return the positions corresponding to unique intervals that overlap + with the given array of scalar targets. + """ + + # TODO: write get_indexer_intervals + cdef: + Py_ssize_t old_len + Py_ssize_t i + Int64Vector result + + result = Int64Vector() + old_len = 0 + for i in range(len(target)): + try: + self.root.query(result, target[i]) + except OverflowError: + # overflow -> no match, which is already handled below + pass + + if result.data.n == old_len: + result.append(-1) + elif result.data.n > old_len + 1: + raise KeyError( + 'indexer does not intersect a unique set of intervals') + old_len = result.data.n + return result.to_array().astype('intp') + + def get_indexer_non_unique(self, scalar_t[:] target): + """Return the positions corresponding to intervals that overlap with + the given array of scalar targets. Non-unique positions are repeated. 
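+
+        A target overlapping several intervals contributes one position
+        per overlap; a target overlapping none yields -1 in the result
+        array and has its own position recorded in the missing array.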
+ """ + cdef: + Py_ssize_t old_len + Py_ssize_t i + Int64Vector result, missing + + result = Int64Vector() + missing = Int64Vector() + old_len = 0 + for i in range(len(target)): + try: + self.root.query(result, target[i]) + except OverflowError: + # overflow -> no match, which is already handled below + pass + + if result.data.n == old_len: + result.append(-1) + missing.append(i) + old_len = result.data.n + return (result.to_array().astype('intp'), + missing.to_array().astype('intp')) + + def __repr__(self) -> str: + return ('<IntervalTree[{dtype},{closed}]: ' + '{n_elements} elements>'.format( + dtype=self.dtype, closed=self.closed, + n_elements=self.root.n_elements)) + + # compat with IndexEngine interface + def clear_mapping(self) -> None: + pass + + +cdef take(ndarray source, ndarray indices): + """Take the given positions from a 1D ndarray + """ + return PyArray_Take(source, indices, 0) + + +cdef sort_values_and_indices(all_values, all_indices, subset): + indices = take(all_indices, subset) + values = take(all_values, subset) + sorter = PyArray_ArgSort(values, 0, NPY_QUICKSORT) + sorted_values = take(values, sorter) + sorted_indices = take(indices, sorter) + return sorted_values, sorted_indices + + +# ---------------------------------------------------------------------- +# Nodes +# ---------------------------------------------------------------------- + +@cython.internal +cdef class IntervalNode: + cdef readonly: + int64_t n_elements, n_center, leaf_size + bint is_leaf_node + + def __repr__(self) -> str: + if self.is_leaf_node: + return ( + f"<{type(self).__name__}: {self.n_elements} elements (terminal)>" + ) + else: + n_left = self.left_node.n_elements + n_right = self.right_node.n_elements + n_center = self.n_elements - n_left - n_right + return ( + f"<{type(self).__name__}: " + f"pivot {self.pivot}, {self.n_elements} elements " + f"({n_left} left, {n_right} right, {n_center} overlapping)>" + ) + + def counts(self): + """ + Inspect counts on this node + useful for debugging purposes + """ + if self.is_leaf_node: + return self.n_elements + else: + m = len(self.center_left_values) + l = self.left_node.counts() + r = self.right_node.counts() + return (m, (l, r)) + + +# we need specialized nodes and leaves to optimize for different dtype and +# closed values + +{{py: + +nodes = [] +for dtype in ['float64', 'int64', 'uint64']: + for closed, cmp_left, cmp_right in [ + ('left', '<=', '<'), + ('right', '<', '<='), + ('both', '<=', '<='), + ('neither', '<', '<')]: + cmp_left_converse = '<' if cmp_left == '<=' else '<=' + cmp_right_converse = '<' if cmp_right == '<=' else '<=' + if dtype.startswith('int'): + fused_prefix = 'int_' + elif dtype.startswith('uint'): + fused_prefix = 'uint_' + elif dtype.startswith('float'): + fused_prefix = '' + nodes.append((dtype, dtype.title(), + closed, closed.title(), + cmp_left, + cmp_right, + cmp_left_converse, + cmp_right_converse, + fused_prefix)) + +}} + +NODE_CLASSES = {} + +{{for dtype, dtype_title, closed, closed_title, cmp_left, cmp_right, + cmp_left_converse, cmp_right_converse, fused_prefix in nodes}} + + +@cython.internal +cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode(IntervalNode): + """Non-terminal node for an IntervalTree + + Categorizes intervals by those that fall to the left, those that fall to + the right, and those that overlap with the pivot. 
+ """ + cdef readonly: + {{dtype_title}}Closed{{closed_title}}IntervalNode left_node, right_node + {{dtype}}_t[:] center_left_values, center_right_values, left, right + int64_t[:] center_left_indices, center_right_indices, indices + {{dtype}}_t min_left, max_right + {{dtype}}_t pivot + + def __init__(self, + ndarray[{{dtype}}_t, ndim=1] left, + ndarray[{{dtype}}_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + int64_t leaf_size): + + self.n_elements = len(left) + self.leaf_size = leaf_size + + # min_left and min_right are used to speed-up query by skipping + # query on sub-nodes. If this node has size 0, query is cheap, + # so these values don't matter. + if left.size > 0: + self.min_left = left.min() + self.max_right = right.max() + else: + self.min_left = 0 + self.max_right = 0 + + if self.n_elements <= leaf_size: + # make this a terminal (leaf) node + self.is_leaf_node = True + self.left = left + self.right = right + self.indices = indices + self.n_center = 0 + else: + # calculate a pivot so we can create child nodes + self.is_leaf_node = False + self.pivot = np.median(left / 2 + right / 2) + if np.isinf(self.pivot): + self.pivot = cython.cast({{dtype}}_t, 0) + if self.pivot > np.max(right): + self.pivot = np.max(left) + if self.pivot < np.min(left): + self.pivot = np.min(right) + + left_set, right_set, center_set = self.classify_intervals( + left, right) + + self.left_node = self.new_child_node(left, right, + indices, left_set) + self.right_node = self.new_child_node(left, right, + indices, right_set) + + self.center_left_values, self.center_left_indices = \ + sort_values_and_indices(left, indices, center_set) + self.center_right_values, self.center_right_indices = \ + sort_values_and_indices(right, indices, center_set) + self.n_center = len(self.center_left_indices) + + @cython.wraparound(False) + @cython.boundscheck(False) + cdef classify_intervals(self, {{dtype}}_t[:] left, {{dtype}}_t[:] right): + """Classify the given intervals based upon whether they fall to the + left, right, or overlap with this node's pivot. + """ + cdef: + Int64Vector left_ind, right_ind, overlapping_ind + Py_ssize_t i + + left_ind = Int64Vector() + right_ind = Int64Vector() + overlapping_ind = Int64Vector() + + for i in range(self.n_elements): + if right[i] {{cmp_right_converse}} self.pivot: + left_ind.append(i) + elif self.pivot {{cmp_left_converse}} left[i]: + right_ind.append(i) + else: + overlapping_ind.append(i) + + return (left_ind.to_array(), + right_ind.to_array(), + overlapping_ind.to_array()) + + cdef new_child_node(self, + ndarray[{{dtype}}_t, ndim=1] left, + ndarray[{{dtype}}_t, ndim=1] right, + ndarray[int64_t, ndim=1] indices, + ndarray[int64_t, ndim=1] subset): + """Create a new child node. + """ + left = take(left, subset) + right = take(right, subset) + indices = take(indices, subset) + return {{dtype_title}}Closed{{closed_title}}IntervalNode( + left, right, indices, self.leaf_size) + + @cython.wraparound(False) + @cython.boundscheck(False) + @cython.initializedcheck(False) + cpdef query(self, Int64Vector result, {{fused_prefix}}scalar_t point): + """Recursively query this node and its sub-nodes for intervals that + overlap with the query point. + """ + cdef: + int64_t[:] indices + {{dtype}}_t[:] values + Py_ssize_t i + + if self.is_leaf_node: + # Once we get down to a certain size, it doesn't make sense to + # continue the binary tree structure. Instead, we use linear + # search. 
+ for i in range(self.n_elements): + if self.left[i] {{cmp_left}} point {{cmp_right}} self.right[i]: + result.append(self.indices[i]) + else: + # There are child nodes. Based on comparing our query to the pivot, + # look at the center values, then go to the relevant child. + if point < self.pivot: + values = self.center_left_values + indices = self.center_left_indices + for i in range(self.n_center): + if not values[i] {{cmp_left}} point: + break + result.append(indices[i]) + if point {{cmp_right}} self.left_node.max_right: + self.left_node.query(result, point) + elif point > self.pivot: + values = self.center_right_values + indices = self.center_right_indices + for i in range(self.n_center - 1, -1, -1): + if not point {{cmp_right}} values[i]: + break + result.append(indices[i]) + if self.right_node.min_left {{cmp_left}} point: + self.right_node.query(result, point) + else: + result.extend(self.center_left_indices) + + +NODE_CLASSES['{{dtype}}', + '{{closed}}'] = {{dtype_title}}Closed{{closed_title}}IntervalNode + +{{endfor}} diff --git a/contrib/python/pandas/py3/pandas/_libs/khash_for_primitive_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/khash_for_primitive_helper.pxi.in new file mode 100644 index 0000000000..d0934b3e0e --- /dev/null +++ b/contrib/python/pandas/py3/pandas/_libs/khash_for_primitive_helper.pxi.in @@ -0,0 +1,44 @@ +""" +Template for wrapping khash-tables for each primitive `dtype` + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +{{py: + +# name, c_type +primitive_types = [('int64', 'int64_t'), + ('uint64', 'uint64_t'), + ('float64', 'float64_t'), + ('int32', 'int32_t'), + ('uint32', 'uint32_t'), + ('float32', 'float32_t'), + ('int16', 'int16_t'), + ('uint16', 'uint16_t'), + ('int8', 'int8_t'), + ('uint8', 'uint8_t'), + ('complex64', 'khcomplex64_t'), + ('complex128', 'khcomplex128_t'), + ] +}} + +{{for name, c_type in primitive_types}} + +cdef extern from "khash_python.h": + ctypedef struct kh_{{name}}_t: + khuint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + {{c_type}} *keys + size_t *vals + + kh_{{name}}_t* kh_init_{{name}}() nogil + void kh_destroy_{{name}}(kh_{{name}}_t*) nogil + void kh_clear_{{name}}(kh_{{name}}_t*) nogil + khuint_t kh_get_{{name}}(kh_{{name}}_t*, {{c_type}}) nogil + void kh_resize_{{name}}(kh_{{name}}_t*, khuint_t) nogil + khuint_t kh_put_{{name}}(kh_{{name}}_t*, {{c_type}}, int*) nogil + void kh_del_{{name}}(kh_{{name}}_t*, khuint_t) nogil + + bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil + +{{endfor}} diff --git a/contrib/python/pandas/py3/pandas/_libs/sparse_op_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/sparse_op_helper.pxi.in new file mode 100644 index 0000000000..774a8c579f --- /dev/null +++ b/contrib/python/pandas/py3/pandas/_libs/sparse_op_helper.pxi.in @@ -0,0 +1,313 @@ +""" +Template for each `dtype` helper function for sparse ops + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +# ---------------------------------------------------------------------- +# Sparse op +# ---------------------------------------------------------------------- + +ctypedef fused sparse_t: + float64_t + int64_t + + +cdef float64_t __div__(sparse_t a, sparse_t b): + if b == 0: + if a > 0: + return INF + elif a < 0: + return -INF + else: + return NaN + else: + return float(a) / b + + +cdef float64_t __truediv__(sparse_t a, sparse_t b): + return __div__(a, b) + + +cdef sparse_t __mod__(sparse_t a, sparse_t b): + if b == 0: + if sparse_t is float64_t: + 
return NaN + else: + return 0 + else: + return a % b + + +cdef sparse_t __floordiv__(sparse_t a, sparse_t b): + if b == 0: + if sparse_t is float64_t: + # Match non-sparse Series behavior implemented in mask_zero_div_zero + if a > 0: + return INF + elif a < 0: + return -INF + return NaN + else: + return 0 + else: + return a // b + + +# ---------------------------------------------------------------------- +# sparse array op +# ---------------------------------------------------------------------- + +{{py: + +# dtype, arith_comp_group, logical_group +dtypes = [('float64', True, False), + ('int64', True, True), + ('uint8', False, True)] +# do not generate arithmetic / comparison template for uint8, +# it should be done in fused types + +def get_op(tup): + assert isinstance(tup, tuple) + assert len(tup) == 4 + + opname, lval, rval, dtype = tup + + ops_dict = {'add': '{0} + {1}', + 'sub': '{0} - {1}', + 'mul': '{0} * {1}', + 'div': '__div__({0}, {1})', + 'mod': '__mod__({0}, {1})', + 'truediv': '__truediv__({0}, {1})', + 'floordiv': '__floordiv__({0}, {1})', + 'pow': '{0} ** {1}', + 'eq': '{0} == {1}', + 'ne': '{0} != {1}', + 'lt': '{0} < {1}', + 'gt': '{0} > {1}', + 'le': '{0} <= {1}', + 'ge': '{0} >= {1}', + + 'and': '{0} & {1}', # logical op + 'or': '{0} | {1}', + 'xor': '{0} ^ {1}'} + + return ops_dict[opname].format(lval, rval) + + +def get_dispatch(dtypes): + + ops_list = ['add', 'sub', 'mul', 'div', 'mod', 'truediv', + 'floordiv', 'pow', + 'eq', 'ne', 'lt', 'gt', 'le', 'ge', + 'and', 'or', 'xor'] + + for opname in ops_list: + for dtype, arith_comp_group, logical_group in dtypes: + + if opname in ('div', 'truediv'): + rdtype = 'float64' + elif opname in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): + # comparison op + rdtype = 'uint8' + elif opname in ('and', 'or', 'xor'): + # logical op + rdtype = 'uint8' + else: + rdtype = dtype + + if opname in ('and', 'or', 'xor'): + if logical_group: + yield opname, dtype, rdtype + else: + if arith_comp_group: + yield opname, dtype, rdtype + +}} + + +{{for opname, dtype, rdtype in get_dispatch(dtypes)}} + +{{if opname == "pow"}} +@cython.cpow(True) # Cython 3 matches Python pow, which isn't what we want here +{{endif}} +@cython.wraparound(False) +@cython.boundscheck(False) +cdef tuple block_op_{{opname}}_{{dtype}}({{dtype}}_t[:] x_, + BlockIndex xindex, + {{dtype}}_t xfill, + {{dtype}}_t[:] y_, + BlockIndex yindex, + {{dtype}}_t yfill): + """ + Binary operator on BlockIndex objects with fill values + """ + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + {{dtype}}_t[:] x, y + ndarray[{{rdtype}}_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.{{rdtype}}) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
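+    # xi/yi index the dense value buffers while (xblock, xbp) and
+    # (yblock, ybp) track the current block and the offset within it;
+    # once one side runs out of blocks, its fill value is combined with
+    # the remaining values of the other side.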
+ for out_i in range(out_index.npoints): + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}} + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}} + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}} + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}} + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}} + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}} + +{{if opname == "pow"}} +@cython.cpow(True) # Cython 3 matches Python pow, which isn't what we want here +{{endif}} +@cython.wraparound(False) +@cython.boundscheck(False) +cdef tuple int_op_{{opname}}_{{dtype}}({{dtype}}_t[:] x_, + IntIndex xindex, + {{dtype}}_t xfill, + {{dtype}}_t[:] y_, + IntIndex yindex, + {{dtype}}_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + int32_t[:] xindices, yindices, out_indices + {{dtype}}_t[:] x, y + ndarray[{{rdtype}}_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.{{rdtype}}) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
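+    # simpler than the block walk above: xi/yi step through the two
+    # sorted index arrays; equal locations consume one value from each
+    # side, and otherwise the lagging side's value is combined with the
+    # other side's fill value.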
+ for out_i in range(out_index.npoints): + if xi == xindex.npoints: + # use x fill value + out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}} + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}} + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}} + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}} + xi += 1 + else: + # use x fill value + out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}} + yi += 1 + + return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}} + + +cpdef sparse_{{opname}}_{{dtype}}({{dtype}}_t[:] x, + SparseIndex xindex, {{dtype}}_t xfill, + {{dtype}}_t[:] y, + SparseIndex yindex, {{dtype}}_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_{{opname}}_{{dtype}}(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_{{opname}}_{{dtype}}(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + +{{endfor}} diff --git a/contrib/python/pandas/py3/patches/01-arcadia.patch b/contrib/python/pandas/py3/patches/01-arcadia.patch new file mode 100644 index 0000000000..b228067b37 --- /dev/null +++ b/contrib/python/pandas/py3/patches/01-arcadia.patch @@ -0,0 +1,33 @@ +--- contrib/python/pandas/py3/pandas/_libs/src/klib/khash_python.h (index) ++++ contrib/python/pandas/py3/pandas/_libs/src/klib/khash_python.h (working tree) +@@ -23,1 +23,1 @@ typedef npy_complex128 khcomplex128_t; +-void *traced_malloc(size_t size){ ++static void *traced_malloc(size_t size){ +@@ -31,1 +31,1 @@ void *traced_malloc(size_t size){ +-void *traced_calloc(size_t num, size_t size){ ++static void *traced_calloc(size_t num, size_t size){ +@@ -39,1 +39,1 @@ void *traced_calloc(size_t num, size_t size){ +-void *traced_realloc(void* old_ptr, size_t size){ ++static void *traced_realloc(void* old_ptr, size_t size){ +@@ -50,1 +50,1 @@ void *traced_realloc(void* old_ptr, size_t size){ +-void traced_free(void* ptr){ ++static void traced_free(void* ptr){ +--- contrib/python/pandas/py3/pandas/_libs/src/ujson/python/date_conversions.c (index) ++++ contrib/python/pandas/py3/pandas/_libs/src/ujson/python/date_conversions.c (working tree) +@@ -9,2 +9,2 @@ The full license is in the LICENSE file, distributed with this software. +-#include <../../../tslibs/src/datetime/np_datetime.h> +-#include <../../../tslibs/src/datetime/np_datetime_strings.h> ++#include "../../../tslibs/src/datetime/np_datetime.h" ++#include "../../../tslibs/src/datetime/np_datetime_strings.h" +--- contrib/python/pandas/py3/pandas/compat/_optional.py (index) ++++ contrib/python/pandas/py3/pandas/compat/_optional.py (working tree) +@@ -41,1 +41,1 @@ VERSIONS = { +- "sqlalchemy": "1.4.16", ++ "sqlalchemy": "1.2.0", +--- contrib/python/pandas/py3/ya.make (index) ++++ contrib/python/pandas/py3/ya.make (working tree) +@@ -44,2 +44,4 @@ CFLAGS( + ++INCLUDE(symbols.cmake) ++ + SRCS( |