author     maxim-yurchuk <maxim-yurchuk@yandex-team.com>  2024-10-09 12:29:46 +0300
committer  maxim-yurchuk <maxim-yurchuk@yandex-team.com>  2024-10-09 13:14:22 +0300
commit     9731d8a4bb7ee2cc8554eaf133bb85498a4c7d80 (patch)
tree       a8fb3181d5947c0d78cf402aa56e686130179049 /contrib/python/pandas/py3
parent     a44b779cd359f06c3ebbef4ec98c6b38609d9d85 (diff)
download   ydb-9731d8a4bb7ee2cc8554eaf133bb85498a4c7d80.tar.gz
publishFullContrib: true for ydb
<HIDDEN_URL> commit_hash:c82a80ac4594723cebf2c7387dec9c60217f603e
Diffstat (limited to 'contrib/python/pandas/py3')
-rw-r--r--  contrib/python/pandas/py3/.yandex_meta/yamaker.yaml                      |   17
-rw-r--r--  contrib/python/pandas/py3/LICENSES/HAVEN_MIT                             |   32
-rw-r--r--  contrib/python/pandas/py3/LICENSES/OTHER                                 |   75
-rw-r--r--  contrib/python/pandas/py3/pandas/_libs/algos_common_helper.pxi.in        |   73
-rw-r--r--  contrib/python/pandas/py3/pandas/_libs/algos_take_helper.pxi.in          |  222
-rw-r--r--  contrib/python/pandas/py3/pandas/_libs/hashtable_class_helper.pxi.in     | 1506
-rw-r--r--  contrib/python/pandas/py3/pandas/_libs/hashtable_func_helper.pxi.in      |  484
-rw-r--r--  contrib/python/pandas/py3/pandas/_libs/index_class_helper.pxi.in         |   78
-rw-r--r--  contrib/python/pandas/py3/pandas/_libs/intervaltree.pxi.in               |  434
-rw-r--r--  contrib/python/pandas/py3/pandas/_libs/khash_for_primitive_helper.pxi.in |   44
-rw-r--r--  contrib/python/pandas/py3/pandas/_libs/sparse_op_helper.pxi.in           |  313
-rw-r--r--  contrib/python/pandas/py3/patches/01-arcadia.patch                       |   33
12 files changed, 3311 insertions, 0 deletions
diff --git a/contrib/python/pandas/py3/.yandex_meta/yamaker.yaml b/contrib/python/pandas/py3/.yandex_meta/yamaker.yaml
new file mode 100644
index 0000000000..b0e7556343
--- /dev/null
+++ b/contrib/python/pandas/py3/.yandex_meta/yamaker.yaml
@@ -0,0 +1,17 @@
+copy:
+ - LICENSES/*
+exclude_from_macros:
+ - LICENSES/*
+ - symbols.cmake
+ - pandas/_libs/src/headers/cmath
+keep:
+ - symbols.cmake
+cython_directive:
+ - language_level=3
+cython_templates:
+ - pandas/*.pxi.in
+mark_as_cython_c:
+ - pandas/_libs/[!w]*.pyx
+ - pandas/_libs/window/indexers.pyx
+ - pandas/_libs/writers.pyx
+ - pandas/io/sas/sas.pyx
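The mark_as_cython_c globs above compile most of pandas/_libs as C (rather than C++) Cython: `[!w]` excludes names that start with "w" (the window/ subdirectory and writers.pyx), and the two files that should still be built as C are then listed back explicitly. Assuming yamaker's globs follow ordinary fnmatch semantics (an illustration, not part of the commit), Python can show what the pattern matches:

from fnmatch import fnmatchcase

pattern = "pandas/_libs/[!w]*.pyx"
for path in ("pandas/_libs/algos.pyx",
             "pandas/_libs/writers.pyx",
             "pandas/_libs/window/indexers.pyx"):
    # [!w] matches any single character except "w", so files and
    # directories under _libs that start with "w" fall outside the glob
    print(path, fnmatchcase(path, pattern))

This is why indexers.pyx and writers.pyx appear as separate entries even though `[!w]*.pyx` covers everything else.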
diff --git a/contrib/python/pandas/py3/LICENSES/HAVEN_MIT b/contrib/python/pandas/py3/LICENSES/HAVEN_MIT
new file mode 100644
index 0000000000..b03d0e6406
--- /dev/null
+++ b/contrib/python/pandas/py3/LICENSES/HAVEN_MIT
@@ -0,0 +1,32 @@
+Based on http://opensource.org/licenses/MIT
+
+This is a template. Complete and ship as file LICENSE the following 2
+lines (only)
+
+YEAR:
+COPYRIGHT HOLDER:
+
+and specify as
+
+License: MIT + file LICENSE
+
+Copyright (c) <YEAR>, <COPYRIGHT HOLDER>
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/contrib/python/pandas/py3/LICENSES/OTHER b/contrib/python/pandas/py3/LICENSES/OTHER
new file mode 100644
index 0000000000..7446d68eb4
--- /dev/null
+++ b/contrib/python/pandas/py3/LICENSES/OTHER
@@ -0,0 +1,75 @@
+Bottleneck license
+------------------
+
+Copyright (c) 2010-2012 Archipel Asset Management AB.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+google-api-python-client license
+--------------------------------
+
+Copyright (C) 2012 Google Inc.
+All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Pyperclip v1.3 license
+----------------------
+
+Copyright (c) 2010, Albert Sweigart
+All rights reserved.
+
+BSD-style license:
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the pyperclip nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY Albert Sweigart "AS IS" AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL Albert Sweigart BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/contrib/python/pandas/py3/pandas/_libs/algos_common_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/algos_common_helper.pxi.in
new file mode 100644
index 0000000000..ee815b8bbf
--- /dev/null
+++ b/contrib/python/pandas/py3/pandas/_libs/algos_common_helper.pxi.in
@@ -0,0 +1,73 @@
+"""
+Template for each `dtype` helper function using 1-d template
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+# ----------------------------------------------------------------------
+# ensure_dtype
+# ----------------------------------------------------------------------
+
+
+def ensure_platform_int(object arr):
+ # GH3033, GH1392
+ # platform int is the size of the int pointer, e.g. np.intp
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == cnp.NPY_INTP:
+ return arr
+ else:
+ # equiv: arr.astype(np.intp)
+ return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_INTP)
+ else:
+ return np.array(arr, dtype=np.intp)
+
+
+def ensure_object(object arr):
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == NPY_OBJECT:
+ return arr
+ else:
+ # equiv: arr.astype(object)
+ return cnp.PyArray_Cast(<ndarray>arr, NPY_OBJECT)
+ else:
+ return np.array(arr, dtype=np.object_)
+
+{{py:
+
+# name, c_type, dtype
+dtypes = [('float64', 'FLOAT64', 'float64'),
+ # ('float32', 'FLOAT32', 'float32'), # disabling bc unused
+ ('int8', 'INT8', 'int8'),
+ ('int16', 'INT16', 'int16'),
+ ('int32', 'INT32', 'int32'),
+ ('int64', 'INT64', 'int64'),
+ ('uint64', 'UINT64', 'uint64'),
+ # Disabling uint and complex dtypes because we do not use them
+ # (and compiling them increases wheel size) (except uint64)
+ # ('uint8', 'UINT8', 'uint8'),
+ # ('uint16', 'UINT16', 'uint16'),
+ # ('uint32', 'UINT32', 'uint32'),
+ # ('complex64', 'COMPLEX64', 'complex64'),
+ # ('complex128', 'COMPLEX128', 'complex128')
+]
+
+def get_dispatch(dtypes):
+
+ for name, c_type, dtype in dtypes:
+ yield name, c_type, dtype
+}}
+
+{{for name, c_type, dtype in get_dispatch(dtypes)}}
+
+
+def ensure_{{name}}(object arr):
+ if util.is_array(arr):
+ if (<ndarray>arr).descr.type_num == NPY_{{c_type}}:
+ return arr
+ else:
+ # equiv: arr.astype(np.{{dtype}})
+ return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_{{c_type}})
+ else:
+ return np.asarray(arr, dtype=np.{{dtype}})
+
+{{endfor}}
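Each generated ensure_<name> function has the same shape: arrays already of the target dtype pass through unchanged, mismatched ndarrays are cast, and anything else is coerced through np.asarray. A pure-NumPy sketch of the behaviour of, e.g., the generated ensure_int64 (illustrative only, not the compiled code):

import numpy as np

def ensure_int64(arr):
    # pass through if already int64, cast other ndarrays,
    # and coerce non-array inputs via np.asarray
    if isinstance(arr, np.ndarray):
        return arr if arr.dtype == np.int64 else arr.astype(np.int64)
    return np.asarray(arr, dtype=np.int64)

print(ensure_int64([1, 2, 3]).dtype)             # int64
print(ensure_int64(np.array([1.0, 2.0])).dtype)  # int64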
diff --git a/contrib/python/pandas/py3/pandas/_libs/algos_take_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/algos_take_helper.pxi.in
new file mode 100644
index 0000000000..2a3858674a
--- /dev/null
+++ b/contrib/python/pandas/py3/pandas/_libs/algos_take_helper.pxi.in
@@ -0,0 +1,222 @@
+"""
+Template for each `dtype` helper function for take
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+# ----------------------------------------------------------------------
+# take_1d, take_2d
+# ----------------------------------------------------------------------
+
+
+{{py:
+
+# c_type_in, c_type_out
+dtypes = [
+ ('uint8_t', 'uint8_t'),
+ ('uint8_t', 'object'),
+ ('int8_t', 'int8_t'),
+ ('int8_t', 'int32_t'),
+ ('int8_t', 'int64_t'),
+ ('int8_t', 'float64_t'),
+ ('int16_t', 'int16_t'),
+ ('int16_t', 'int32_t'),
+ ('int16_t', 'int64_t'),
+ ('int16_t', 'float64_t'),
+ ('int32_t', 'int32_t'),
+ ('int32_t', 'int64_t'),
+ ('int32_t', 'float64_t'),
+ ('int64_t', 'int64_t'),
+ ('int64_t', 'float64_t'),
+ ('float32_t', 'float32_t'),
+ ('float32_t', 'float64_t'),
+ ('float64_t', 'float64_t'),
+ ('object', 'object'),
+]
+
+
+def get_dispatch(dtypes):
+
+ for (c_type_in, c_type_out) in dtypes:
+
+ def get_name(dtype_name):
+ if dtype_name == "object":
+ return "object"
+ if dtype_name == "uint8_t":
+ return "bool"
+ return dtype_name[:-2]
+
+ name = get_name(c_type_in)
+ dest = get_name(c_type_out)
+
+ args = dict(name=name, dest=dest, c_type_in=c_type_in,
+ c_type_out=c_type_out)
+
+ yield (name, dest, c_type_in, c_type_out)
+
+}}
+
+
+{{for name, dest, c_type_in, c_type_out in get_dispatch(dtypes)}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+{{if c_type_in != "object"}}
+def take_1d_{{name}}_{{dest}}(const {{c_type_in}}[:] values,
+{{else}}
+def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values,
+{{endif}}
+ const intp_t[:] indexer,
+ {{c_type_out}}[:] out,
+ fill_value=np.nan):
+
+ cdef:
+ Py_ssize_t i, n, idx
+ {{c_type_out}} fv
+
+ n = indexer.shape[0]
+
+ fv = fill_value
+
+ {{if c_type_out != "object"}}
+ with nogil:
+ {{else}}
+ if True:
+ {{endif}}
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ out[i] = fv
+ else:
+ {{if c_type_in == "uint8_t" and c_type_out == "object"}}
+ out[i] = True if values[idx] > 0 else False
+ {{else}}
+ out[i] = values[idx]
+ {{endif}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+{{if c_type_in != "object"}}
+def take_2d_axis0_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
+{{else}}
+def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
+{{endif}}
+ ndarray[intp_t, ndim=1] indexer,
+ {{c_type_out}}[:, :] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ {{c_type_out}} fv
+ {{if c_type_in == c_type_out != "object"}}
+ const {{c_type_out}} *v
+ {{c_type_out}} *o
+ {{endif}}
+
+ n = len(indexer)
+ k = values.shape[1]
+
+ fv = fill_value
+
+ {{if c_type_in == c_type_out != "object"}}
+ # GH#3130
+ if (values.strides[1] == out.strides[1] and
+ values.strides[1] == sizeof({{c_type_out}}) and
+ sizeof({{c_type_out}}) * n >= 256):
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ v = &values[idx, 0]
+ o = &out[i, 0]
+ memmove(o, v, <size_t>(sizeof({{c_type_out}}) * k))
+ return
+ {{endif}}
+
+ for i in range(n):
+ idx = indexer[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ {{if c_type_in == "uint8_t" and c_type_out == "object"}}
+ out[i, j] = True if values[idx, j] > 0 else False
+ {{else}}
+ out[i, j] = values[idx, j]
+ {{endif}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+{{if c_type_in != "object"}}
+def take_2d_axis1_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
+{{else}}
+def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
+{{endif}}
+ ndarray[intp_t, ndim=1] indexer,
+ {{c_type_out}}[:, :] out,
+ fill_value=np.nan):
+
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ {{c_type_out}} fv
+
+ n = len(values)
+ k = len(indexer)
+
+ if n == 0 or k == 0:
+ return
+
+ fv = fill_value
+
+ for i in range(n):
+ for j in range(k):
+ idx = indexer[j]
+ if idx == -1:
+ out[i, j] = fv
+ else:
+ {{if c_type_in == "uint8_t" and c_type_out == "object"}}
+ out[i, j] = True if values[i, idx] > 0 else False
+ {{else}}
+ out[i, j] = values[i, idx]
+ {{endif}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
+ indexer,
+ ndarray[{{c_type_out}}, ndim=2] out,
+ fill_value=np.nan):
+ cdef:
+ Py_ssize_t i, j, k, n, idx
+ ndarray[intp_t, ndim=1] idx0 = indexer[0]
+ ndarray[intp_t, ndim=1] idx1 = indexer[1]
+ {{c_type_out}} fv
+
+ n = len(idx0)
+ k = len(idx1)
+
+ fv = fill_value
+ for i in range(n):
+ idx = idx0[i]
+ if idx == -1:
+ for j in range(k):
+ out[i, j] = fv
+ else:
+ for j in range(k):
+ if idx1[j] == -1:
+ out[i, j] = fv
+ else:
+ {{if c_type_in == "uint8_t" and c_type_out == "object"}}
+ out[i, j] = True if values[idx, idx1[j]] > 0 else False
+ {{else}}
+ out[i, j] = values[idx, idx1[j]]
+ {{endif}}
+
+{{endfor}}
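All take_1d/take_2d kernels above share one contract: out[i] = values[indexer[i]], with fill_value written wherever the indexer is -1 (the 2D axis0 variant additionally falls back to memmove for large contiguous same-dtype rows, per GH#3130). A NumPy reference for the 1D case, assuming a float64 destination (a sketch of the semantics, not the generated code):

import numpy as np

def take_1d_reference(values, indexer, fill_value=np.nan):
    # out[i] = values[indexer[i]]; indexer == -1 selects fill_value
    out = np.empty(len(indexer), dtype=np.float64)
    missing = indexer == -1
    out[~missing] = values[indexer[~missing]]
    out[missing] = fill_value
    return out

vals = np.array([10, 20, 30], dtype=np.int64)
print(take_1d_reference(vals, np.array([2, -1, 0])))  # [30. nan 10.]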
diff --git a/contrib/python/pandas/py3/pandas/_libs/hashtable_class_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/hashtable_class_helper.pxi.in
new file mode 100644
index 0000000000..d4d3117a32
--- /dev/null
+++ b/contrib/python/pandas/py3/pandas/_libs/hashtable_class_helper.pxi.in
@@ -0,0 +1,1506 @@
+"""
+Template for each `dtype` helper function for hashtable
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+
+{{py:
+
+# name
+complex_types = ['complex64',
+ 'complex128']
+}}
+
+{{for name in complex_types}}
+cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil:
+ cdef kh{{name}}_t res
+ res.real = val.real
+ res.imag = val.imag
+ return res
+
+{{endfor}}
+
+
+{{py:
+
+
+# name
+c_types = ['khcomplex128_t',
+ 'khcomplex64_t',
+ 'float64_t',
+ 'float32_t',
+ 'int64_t',
+ 'int32_t',
+ 'int16_t',
+ 'int8_t',
+ 'uint64_t',
+ 'uint32_t',
+ 'uint16_t',
+ 'uint8_t']
+}}
+
+{{for c_type in c_types}}
+
+cdef bint is_nan_{{c_type}}({{c_type}} val) nogil:
+ {{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }}
+ return val.real != val.real or val.imag != val.imag
+ {{elif c_type in {'float64_t', 'float32_t'} }}
+ return val != val
+ {{else}}
+ return False
+ {{endif}}
+
+
+{{if c_type in {'khcomplex128_t', 'khcomplex64_t', 'float64_t', 'float32_t'} }}
+# are_equivalent_{{c_type}} is cimported via khash.pxd
+{{else}}
+cdef bint are_equivalent_{{c_type}}({{c_type}} val1, {{c_type}} val2) nogil:
+ return val1 == val2
+{{endif}}
+
+{{endfor}}
+
+
+{{py:
+
+# name
+cimported_types = ['complex64',
+ 'complex128',
+ 'float32',
+ 'float64',
+ 'int8',
+ 'int16',
+ 'int32',
+ 'int64',
+ 'pymap',
+ 'str',
+ 'strbox',
+ 'uint8',
+ 'uint16',
+ 'uint32',
+ 'uint64']
+}}
+
+{{for name in cimported_types}}
+from pandas._libs.khash cimport (
+ kh_destroy_{{name}},
+ kh_exist_{{name}},
+ kh_get_{{name}},
+ kh_init_{{name}},
+ kh_put_{{name}},
+ kh_resize_{{name}},
+)
+
+{{endfor}}
+
+# ----------------------------------------------------------------------
+# VectorData
+# ----------------------------------------------------------------------
+
+from pandas._libs.tslibs.util cimport get_c_string
+from pandas._libs.missing cimport C_NA
+
+
+{{py:
+
+# name, dtype, c_type
+# the generated StringVector is not actually used
+# but is included for completeness (instead, ObjectVector is used
+# for uniques in hashtables)
+
+dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
+ ('Complex64', 'complex64', 'khcomplex64_t'),
+ ('Float64', 'float64', 'float64_t'),
+ ('Float32', 'float32', 'float32_t'),
+ ('Int64', 'int64', 'int64_t'),
+ ('Int32', 'int32', 'int32_t'),
+ ('Int16', 'int16', 'int16_t'),
+ ('Int8', 'int8', 'int8_t'),
+ ('String', 'string', 'char *'),
+ ('UInt64', 'uint64', 'uint64_t'),
+ ('UInt32', 'uint32', 'uint32_t'),
+ ('UInt16', 'uint16', 'uint16_t'),
+ ('UInt8', 'uint8', 'uint8_t')]
+}}
+
+{{for name, dtype, c_type in dtypes}}
+
+
+{{if dtype != 'int64'}}
+# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
+# by IntervalTree
+
+ctypedef struct {{name}}VectorData:
+ {{c_type}} *data
+ Py_ssize_t n, m
+
+{{endif}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef void append_data_{{dtype}}({{name}}VectorData *data,
+ {{c_type}} x) nogil:
+
+ data.data[data.n] = x
+ data.n += 1
+
+{{endfor}}
+
+ctypedef fused vector_data:
+ Int64VectorData
+ Int32VectorData
+ Int16VectorData
+ Int8VectorData
+ UInt64VectorData
+ UInt32VectorData
+ UInt16VectorData
+ UInt8VectorData
+ Float64VectorData
+ Float32VectorData
+ Complex128VectorData
+ Complex64VectorData
+ StringVectorData
+
+cdef bint needs_resize(vector_data *data) nogil:
+ return data.n == data.m
+
+# ----------------------------------------------------------------------
+# Vector
+# ----------------------------------------------------------------------
+
+cdef class Vector:
+ # cdef readonly:
+ # bint external_view_exists
+
+ def __cinit__(self):
+ self.external_view_exists = False
+
+
+{{py:
+
+# name, dtype, c_type
+dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
+ ('Complex64', 'complex64', 'khcomplex64_t'),
+ ('Float64', 'float64', 'float64_t'),
+ ('UInt64', 'uint64', 'uint64_t'),
+ ('Int64', 'int64', 'int64_t'),
+ ('Float32', 'float32', 'float32_t'),
+ ('UInt32', 'uint32', 'uint32_t'),
+ ('Int32', 'int32', 'int32_t'),
+ ('UInt16', 'uint16', 'uint16_t'),
+ ('Int16', 'int16', 'int16_t'),
+ ('UInt8', 'uint8', 'uint8_t'),
+ ('Int8', 'int8', 'int8_t')]
+
+}}
+
+{{for name, dtype, c_type in dtypes}}
+
+cdef class {{name}}Vector(Vector):
+
+ # For int64 we have to put this declaration in the .pxd file;
+ # Int64Vector is the only one we need exposed for other cython files.
+ {{if dtype != 'int64'}}
+ cdef:
+ {{name}}VectorData *data
+ ndarray ao
+ {{endif}}
+
+ def __cinit__(self):
+ self.data = <{{name}}VectorData *>PyMem_Malloc(
+ sizeof({{name}}VectorData))
+ if not self.data:
+ raise MemoryError()
+ self.data.n = 0
+ self.data.m = _INIT_VEC_CAP
+ self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
+ self.data.data = <{{c_type}}*>self.ao.data
+
+ cdef resize(self):
+ self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
+ self.ao.resize(self.data.m, refcheck=False)
+ self.data.data = <{{c_type}}*>self.ao.data
+
+ def __dealloc__(self):
+ if self.data is not NULL:
+ PyMem_Free(self.data)
+ self.data = NULL
+
+ def __len__(self) -> int:
+ return self.data.n
+
+ cpdef ndarray to_array(self):
+ if self.data.m != self.data.n:
+ if self.external_view_exists:
+ # should never happen
+ raise ValueError("should have raised on append()")
+ self.ao.resize(self.data.n, refcheck=False)
+ self.data.m = self.data.n
+ self.external_view_exists = True
+ return self.ao
+
+ cdef void append(self, {{c_type}} x):
+
+ if needs_resize(self.data):
+ if self.external_view_exists:
+ raise ValueError("external reference but "
+ "Vector.resize() needed")
+ self.resize()
+
+ append_data_{{dtype}}(self.data, x)
+
+ cdef extend(self, const {{c_type}}[:] x):
+ for i in range(len(x)):
+ self.append(x[i])
+
+{{endfor}}
+
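Every Vector generated above shares the same growth policy: needs_resize fires when n == m, and resize() quadruples the capacity (never below _INIT_VEC_CAP), so appends are amortized O(1). A minimal Python sketch of that policy; the concrete value of _INIT_VEC_CAP is assumed here for illustration:

_INIT_VEC_CAP = 128  # assumed value; the real constant lives elsewhere in hashtable.pyx

class GrowableVector:
    def __init__(self):
        self.n = 0                   # elements stored
        self.m = _INIT_VEC_CAP       # current capacity
        self.buf = [None] * self.m

    def append(self, x):
        if self.n == self.m:                          # needs_resize
            self.m = max(self.m * 4, _INIT_VEC_CAP)   # same quadrupling as resize()
            self.buf += [None] * (self.m - len(self.buf))
        self.buf[self.n] = x
        self.n += 1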
+cdef class StringVector(Vector):
+
+ cdef:
+ StringVectorData *data
+
+ def __cinit__(self):
+ self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
+ if not self.data:
+ raise MemoryError()
+ self.data.n = 0
+ self.data.m = _INIT_VEC_CAP
+ self.data.data = <char **>malloc(self.data.m * sizeof(char *))
+ if not self.data.data:
+ raise MemoryError()
+
+ cdef resize(self):
+ cdef:
+ char **orig_data
+ Py_ssize_t i, m
+
+ m = self.data.m
+ self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
+
+ orig_data = self.data.data
+ self.data.data = <char **>malloc(self.data.m * sizeof(char *))
+ if not self.data.data:
+ raise MemoryError()
+ for i in range(m):
+ self.data.data[i] = orig_data[i]
+
+ def __dealloc__(self):
+ if self.data is not NULL:
+ if self.data.data is not NULL:
+ free(self.data.data)
+ PyMem_Free(self.data)
+ self.data = NULL
+
+ def __len__(self) -> int:
+ return self.data.n
+
+ cpdef ndarray[object, ndim=1] to_array(self):
+ cdef:
+ ndarray ao
+ Py_ssize_t n
+ object val
+
+ ao = np.empty(self.data.n, dtype=object)
+ for i in range(self.data.n):
+ val = self.data.data[i]
+ ao[i] = val
+ self.external_view_exists = True
+ self.data.m = self.data.n
+ return ao
+
+ cdef void append(self, char *x):
+
+ if needs_resize(self.data):
+ self.resize()
+
+ append_data_string(self.data, x)
+
+ cdef extend(self, ndarray[object] x):
+ for i in range(len(x)):
+ self.append(x[i])
+
+
+cdef class ObjectVector(Vector):
+
+ cdef:
+ PyObject **data
+ Py_ssize_t n, m
+ ndarray ao
+
+ def __cinit__(self):
+ self.n = 0
+ self.m = _INIT_VEC_CAP
+ self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
+ self.data = <PyObject**>self.ao.data
+
+ def __len__(self) -> int:
+ return self.n
+
+ cdef append(self, object obj):
+ if self.n == self.m:
+ if self.external_view_exists:
+ raise ValueError("external reference but "
+ "Vector.resize() needed")
+ self.m = max(self.m * 2, _INIT_VEC_CAP)
+ self.ao.resize(self.m, refcheck=False)
+ self.data = <PyObject**>self.ao.data
+
+ Py_INCREF(obj)
+ self.data[self.n] = <PyObject*>obj
+ self.n += 1
+
+ cpdef ndarray[object, ndim=1] to_array(self):
+ if self.m != self.n:
+ if self.external_view_exists:
+ raise ValueError("should have raised on append()")
+ self.ao.resize(self.n, refcheck=False)
+ self.m = self.n
+ self.external_view_exists = True
+ return self.ao
+
+ cdef extend(self, ndarray[object] x):
+ for i in range(len(x)):
+ self.append(x[i])
+
+# ----------------------------------------------------------------------
+# HashTable
+# ----------------------------------------------------------------------
+
+
+cdef class HashTable:
+
+ pass
+
+{{py:
+
+# name, dtype, c_type, to_c_type
+dtypes = [('Complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'),
+ ('Float64', 'float64', 'float64_t', ''),
+ ('UInt64', 'uint64', 'uint64_t', ''),
+ ('Int64', 'int64', 'int64_t', ''),
+ ('Complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'),
+ ('Float32', 'float32', 'float32_t', ''),
+ ('UInt32', 'uint32', 'uint32_t', ''),
+ ('Int32', 'int32', 'int32_t', ''),
+ ('UInt16', 'uint16', 'uint16_t', ''),
+ ('Int16', 'int16', 'int16_t', ''),
+ ('UInt8', 'uint8', 'uint8_t', ''),
+ ('Int8', 'int8', 'int8_t', '')]
+
+}}
+
+
+{{for name, dtype, c_type, to_c_type in dtypes}}
+
+cdef class {{name}}HashTable(HashTable):
+
+ def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
+ self.table = kh_init_{{dtype}}()
+ size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
+ kh_resize_{{dtype}}(self.table, size_hint)
+
+ self.uses_mask = uses_mask
+ self.na_position = -1
+
+ def __len__(self) -> int:
+ return self.table.size + (0 if self.na_position == -1 else 1)
+
+ def __dealloc__(self):
+ if self.table is not NULL:
+ kh_destroy_{{dtype}}(self.table)
+ self.table = NULL
+
+ def __contains__(self, object key) -> bool:
+        # The caller is responsible for checking for compatible NA values
+        # in case of masked arrays.
+ cdef:
+ khiter_t k
+ {{c_type}} ckey
+
+ if self.uses_mask and checknull(key):
+ return -1 != self.na_position
+
+ ckey = {{to_c_type}}(key)
+ k = kh_get_{{dtype}}(self.table, ckey)
+ return k != self.table.n_buckets
+
+ def sizeof(self, deep: bool = False) -> int:
+ """ return the size of my table in bytes """
+ overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
+ for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
+ for_pairs = self.table.n_buckets * (sizeof({{dtype}}_t) + # keys
+ sizeof(Py_ssize_t)) # vals
+ return overhead + for_flags + for_pairs
+
+ def get_state(self) -> dict[str, int]:
+ """ returns infos about the state of the hashtable"""
+ return {
+ 'n_buckets' : self.table.n_buckets,
+ 'size' : self.table.size,
+ 'n_occupied' : self.table.n_occupied,
+ 'upper_bound' : self.table.upper_bound,
+ }
+
+ cpdef get_item(self, {{dtype}}_t val):
+ """Extracts the position of val from the hashtable.
+
+ Parameters
+ ----------
+ val : Scalar
+ The value that is looked up in the hashtable
+
+ Returns
+ -------
+        The position of the requested value.
+ """
+
+ # Used in core.sorting, IndexEngine.get_loc
+ # Caller is responsible for checking for pd.NA
+ cdef:
+ khiter_t k
+ {{c_type}} cval
+
+ cval = {{to_c_type}}(val)
+ k = kh_get_{{dtype}}(self.table, cval)
+ if k != self.table.n_buckets:
+ return self.table.vals[k]
+ else:
+ raise KeyError(val)
+
+ cpdef get_na(self):
+ """Extracts the position of na_value from the hashtable.
+
+ Returns
+ -------
+ The position of the last na value.
+ """
+
+ if not self.uses_mask:
+ raise NotImplementedError
+
+ if self.na_position == -1:
+ raise KeyError("NA")
+ return self.na_position
+
+ cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val):
+ # Used in libjoin
+ # Caller is responsible for checking for pd.NA
+ cdef:
+ khiter_t k
+ int ret = 0
+ {{c_type}} ckey
+
+ ckey = {{to_c_type}}(key)
+ k = kh_put_{{dtype}}(self.table, ckey, &ret)
+ if kh_exist_{{dtype}}(self.table, k):
+ self.table.vals[k] = val
+ else:
+ raise KeyError(key)
+
+ cpdef set_na(self, Py_ssize_t val):
+ # Caller is responsible for checking for pd.NA
+ cdef:
+ khiter_t k
+ int ret = 0
+ {{c_type}} ckey
+
+ if not self.uses_mask:
+ raise NotImplementedError
+
+ self.na_position = val
+
+ {{if dtype == "int64" }}
+ # We only use this for int64, can reduce build size and make .pyi
+ # more accurate by only implementing it for int64
+ @cython.boundscheck(False)
+ def map_keys_to_values(
+ self, const {{dtype}}_t[:] keys, const int64_t[:] values
+ ) -> None:
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ {{c_type}} key
+ khiter_t k
+
+ with nogil:
+ for i in range(n):
+ key = {{to_c_type}}(keys[i])
+ k = kh_put_{{dtype}}(self.table, key, &ret)
+ self.table.vals[k] = <Py_ssize_t>values[i]
+ {{endif}}
+
+ @cython.boundscheck(False)
+ def map_locations(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> None:
+ # Used in libindex, safe_sort
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ {{c_type}} val
+ khiter_t k
+ int8_t na_position = self.na_position
+
+ if self.uses_mask and mask is None:
+ raise NotImplementedError # pragma: no cover
+
+ with nogil:
+ if self.uses_mask:
+ for i in range(n):
+ if mask[i]:
+ na_position = i
+ else:
+                        val = {{to_c_type}}(values[i])
+ k = kh_put_{{dtype}}(self.table, val, &ret)
+ self.table.vals[k] = i
+ else:
+ for i in range(n):
+                    val = {{to_c_type}}(values[i])
+ k = kh_put_{{dtype}}(self.table, val, &ret)
+ self.table.vals[k] = i
+ self.na_position = na_position
+
+ @cython.boundscheck(False)
+ def lookup(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> ndarray:
+ # -> np.ndarray[np.intp]
+ # Used in safe_sort, IndexEngine.get_indexer
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ {{c_type}} val
+ khiter_t k
+ intp_t[::1] locs = np.empty(n, dtype=np.intp)
+ int8_t na_position = self.na_position
+
+ if self.uses_mask and mask is None:
+ raise NotImplementedError # pragma: no cover
+
+ with nogil:
+ for i in range(n):
+ if self.uses_mask and mask[i]:
+ locs[i] = na_position
+ else:
+ val = {{to_c_type}}(values[i])
+ k = kh_get_{{dtype}}(self.table, val)
+ if k != self.table.n_buckets:
+ locs[i] = self.table.vals[k]
+ else:
+ locs[i] = -1
+
+ return np.asarray(locs)
+
+ @cython.boundscheck(False)
+ @cython.wraparound(False)
+ def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None, bint ignore_na=False,
+ object mask=None, bint return_inverse=False, bint use_result_mask=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[{{dtype}}]
+ Array of values of which unique will be calculated
+ uniques : {{name}}Vector
+ Vector into which uniques will be written
+ count_prior : Py_ssize_t, default 0
+ Number of existing entries in uniques
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+ ignore_na : bool, default False
+ Whether NA-values should be ignored for calculating the uniques. If
+ True, the labels corresponding to missing values will be set to
+ na_sentinel.
+ mask : ndarray[bool], optional
+ If not None, the mask is used as indicator for missing values
+ (True = missing, False = valid) instead of `na_value` or
+ condition "val != val".
+ return_inverse : bool, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+        use_result_mask : bool, default False
+ Whether to create a result mask for the unique values. Not supported
+ with return_inverse=True.
+
+ Returns
+ -------
+ uniques : ndarray[{{dtype}}]
+ Unique values of input, not sorted
+ labels : ndarray[intp_t] (if return_inverse=True)
+ The labels from values to uniques
+        result_mask : ndarray[bool], if use_result_mask is True
+ The mask for the result values.
+ """
+ cdef:
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
+ intp_t[::1] labels
+ int ret = 0
+ {{c_type}} val, na_value2
+ khiter_t k
+ {{name}}VectorData *ud
+ UInt8Vector result_mask
+ UInt8VectorData *rmd
+ bint use_na_value, use_mask, seen_na = False
+ uint8_t[:] mask_values
+
+ if return_inverse:
+ labels = np.empty(n, dtype=np.intp)
+ ud = uniques.data
+ use_na_value = na_value is not None
+ use_mask = mask is not None
+ if not use_mask and use_result_mask:
+ raise NotImplementedError # pragma: no cover
+
+ if use_result_mask and return_inverse:
+ raise NotImplementedError # pragma: no cover
+
+ result_mask = UInt8Vector()
+ rmd = result_mask.data
+
+ if use_mask:
+ mask_values = mask.view("uint8")
+
+ if use_na_value:
+ # We need this na_value2 because we want to allow users
+ # to *optionally* specify an NA sentinel *of the correct* type.
+            # We use None to make it optional, which requires `object` type
+ # for the parameter. To please the compiler, we use na_value2,
+ # which is only used if it's *specified*.
+ na_value2 = {{to_c_type}}(na_value)
+ else:
+ na_value2 = {{to_c_type}}(0)
+
+ with nogil:
+ for i in range(n):
+ val = {{to_c_type}}(values[i])
+
+ if ignore_na and use_mask:
+ if mask_values[i]:
+ labels[i] = na_sentinel
+ continue
+ elif ignore_na and (
+ is_nan_{{c_type}}(val) or
+ (use_na_value and are_equivalent_{{c_type}}(val, na_value2))
+ ):
+ # if missing values do not count as unique values (i.e. if
+ # ignore_na is True), skip the hashtable entry for them,
+ # and replace the corresponding label with na_sentinel
+ labels[i] = na_sentinel
+ continue
+ elif not ignore_na and use_result_mask:
+ if mask_values[i]:
+ if seen_na:
+ continue
+
+ seen_na = True
+ if needs_resize(ud):
+ with gil:
+ if uniques.external_view_exists:
+ raise ValueError("external reference to "
+ "uniques held, but "
+ "Vector.resize() needed")
+ uniques.resize()
+ if result_mask.external_view_exists:
+ raise ValueError("external reference to "
+ "result_mask held, but "
+ "Vector.resize() needed")
+ result_mask.resize()
+ append_data_{{dtype}}(ud, val)
+ append_data_uint8(rmd, 1)
+ continue
+
+ k = kh_get_{{dtype}}(self.table, val)
+
+ if k == self.table.n_buckets:
+ # k hasn't been seen yet
+ k = kh_put_{{dtype}}(self.table, val, &ret)
+
+ if needs_resize(ud):
+ with gil:
+ if uniques.external_view_exists:
+ raise ValueError("external reference to "
+ "uniques held, but "
+ "Vector.resize() needed")
+ uniques.resize()
+ if use_result_mask:
+ if result_mask.external_view_exists:
+ raise ValueError("external reference to "
+ "result_mask held, but "
+ "Vector.resize() needed")
+ result_mask.resize()
+ append_data_{{dtype}}(ud, val)
+ if use_result_mask:
+ append_data_uint8(rmd, 0)
+
+ if return_inverse:
+ self.table.vals[k] = count
+ labels[i] = count
+ count += 1
+ elif return_inverse:
+ # k falls into a previous bucket
+ # only relevant in case we need to construct the inverse
+ idx = self.table.vals[k]
+ labels[i] = idx
+
+ if return_inverse:
+ return uniques.to_array(), labels.base # .base -> underlying ndarray
+ if use_result_mask:
+ return uniques.to_array(), result_mask.to_array()
+ return uniques.to_array()
+
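The NA rule described in the docstring above is worth spelling out: with na_value=None only self-unequal values (NaN) count as missing; passing na_value additionally marks values equal to it, and an explicit mask overrides both. A tiny Python sketch of that predicate (illustrative only):

def is_missing(val, na_value=None):
    # val != val is the classic NaN test; na_value adds one more sentinel
    return val != val or (na_value is not None and val == na_value)

print(is_missing(float("nan")))       # True
print(is_missing(0.0, na_value=0.0))  # True
print(is_missing(1.0))                # False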
+ def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[{{dtype}}]
+ Array of values of which unique will be calculated
+ return_inverse : bool, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+ mask : ndarray[bool], optional
+ If not None, the mask is used as indicator for missing values
+            (True = missing, False = valid) instead of `na_value` or
+            condition "val != val".
+
+ Returns
+ -------
+ uniques : ndarray[{{dtype}}]
+ Unique values of input, not sorted
+ labels : ndarray[intp_t] (if return_inverse)
+ The labels from values to uniques
+        result_mask : ndarray[bool], if mask is given as input
+ The mask for the result values.
+ """
+ uniques = {{name}}Vector()
+ use_result_mask = True if mask is not None else False
+ return self._unique(values, uniques, ignore_na=False,
+ return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
+
+ def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
+ object na_value=None, object mask=None, ignore_na=True):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Missing values are not included in the "uniques" for this method.
+ The labels for any missing values will be set to "na_sentinel"
+
+ Parameters
+ ----------
+ values : ndarray[{{dtype}}]
+ Array of values of which unique will be calculated
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+ mask : ndarray[bool], optional
+ If not None, the mask is used as indicator for missing values
+ (True = missing, False = valid) instead of `na_value` or
+ condition "val != val".
+
+ Returns
+ -------
+ uniques : ndarray[{{dtype}}]
+ Unique values of input, not sorted
+ labels : ndarray[intp_t]
+ The labels from values to uniques
+ """
+ uniques_vector = {{name}}Vector()
+ return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+ na_value=na_value, ignore_na=ignore_na, mask=mask,
+ return_inverse=True)
+
+ def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None, object mask=None):
+ # -> np.ndarray[np.intp]
+ _, labels = self._unique(values, uniques, count_prior=count_prior,
+ na_sentinel=na_sentinel, na_value=na_value,
+ ignore_na=True, return_inverse=True, mask=mask)
+ return labels
+
+ {{if dtype == 'int64'}}
+ @cython.boundscheck(False)
+ def get_labels_groupby(
+ self, const {{dtype}}_t[:] values
+ ) -> tuple[ndarray, ndarray]:
+ # tuple[np.ndarray[np.intp], np.ndarray[{{dtype}}]]
+ cdef:
+ Py_ssize_t i, n = len(values)
+ intp_t[::1] labels
+ Py_ssize_t idx, count = 0
+ int ret = 0
+ {{c_type}} val
+ khiter_t k
+ {{name}}Vector uniques = {{name}}Vector()
+ {{name}}VectorData *ud
+
+ labels = np.empty(n, dtype=np.intp)
+ ud = uniques.data
+
+ with nogil:
+ for i in range(n):
+ val = {{to_c_type}}(values[i])
+
+ # specific for groupby
+ if val < 0:
+ labels[i] = -1
+ continue
+
+ k = kh_get_{{dtype}}(self.table, val)
+ if k != self.table.n_buckets:
+ idx = self.table.vals[k]
+ labels[i] = idx
+ else:
+ k = kh_put_{{dtype}}(self.table, val, &ret)
+ self.table.vals[k] = count
+
+ if needs_resize(ud):
+ with gil:
+ uniques.resize()
+ append_data_{{dtype}}(ud, val)
+ labels[i] = count
+ count += 1
+
+ arr_uniques = uniques.to_array()
+
+ return np.asarray(labels), arr_uniques
+ {{endif}}
+
+
+cdef class {{name}}Factorizer(Factorizer):
+ cdef public:
+ {{name}}HashTable table
+ {{name}}Vector uniques
+
+ def __cinit__(self, size_hint: int):
+ self.table = {{name}}HashTable(size_hint)
+ self.uniques = {{name}}Vector()
+
+ def factorize(self, const {{c_type}}[:] values,
+ na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
+ """
+ Returns
+ -------
+ ndarray[intp_t]
+
+ Examples
+ --------
+ Factorize values with nans replaced by na_sentinel
+
+ >>> fac = {{name}}Factorizer(3)
+ >>> fac.factorize(np.array([1,2,3], dtype="{{dtype}}"), na_sentinel=20)
+ array([0, 1, 2])
+ """
+ cdef:
+ ndarray[intp_t] labels
+
+ if self.uniques.external_view_exists:
+ uniques = {{name}}Vector()
+ uniques.extend(self.uniques.to_array())
+ self.uniques = uniques
+ labels = self.table.get_labels(values, self.uniques,
+ self.count, na_sentinel,
+ na_value=na_value, mask=mask)
+ self.count = len(self.uniques)
+ return labels
+
+{{endfor}}
+
+
+cdef class StringHashTable(HashTable):
+ # these by-definition *must* be strings
+ # or a sentinel np.nan / None missing value
+ na_string_sentinel = '__nan__'
+
+ def __init__(self, int64_t size_hint=1):
+ self.table = kh_init_str()
+ size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
+ kh_resize_str(self.table, size_hint)
+
+ def __dealloc__(self):
+ if self.table is not NULL:
+ kh_destroy_str(self.table)
+ self.table = NULL
+
+ def sizeof(self, deep: bool = False) -> int:
+ overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
+ for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
+ for_pairs = self.table.n_buckets * (sizeof(char *) + # keys
+ sizeof(Py_ssize_t)) # vals
+ return overhead + for_flags + for_pairs
+
+ def get_state(self) -> dict[str, int]:
+ """ returns infos about the state of the hashtable"""
+ return {
+ 'n_buckets' : self.table.n_buckets,
+ 'size' : self.table.size,
+ 'n_occupied' : self.table.n_occupied,
+ 'upper_bound' : self.table.upper_bound,
+ }
+
+ cpdef get_item(self, str val):
+ cdef:
+ khiter_t k
+ const char *v
+ v = get_c_string(val)
+
+ k = kh_get_str(self.table, v)
+ if k != self.table.n_buckets:
+ return self.table.vals[k]
+ else:
+ raise KeyError(val)
+
+ cpdef set_item(self, str key, Py_ssize_t val):
+ cdef:
+ khiter_t k
+ int ret = 0
+ const char *v
+
+ v = get_c_string(key)
+
+ k = kh_put_str(self.table, v, &ret)
+ if kh_exist_str(self.table, k):
+ self.table.vals[k] = val
+ else:
+ raise KeyError(key)
+
+ @cython.boundscheck(False)
+ def get_indexer(self, ndarray[object] values) -> ndarray:
+ # -> np.ndarray[np.intp]
+ cdef:
+ Py_ssize_t i, n = len(values)
+ ndarray[intp_t] labels = np.empty(n, dtype=np.intp)
+ intp_t *resbuf = <intp_t*>labels.data
+ khiter_t k
+ kh_str_t *table = self.table
+ const char *v
+ const char **vecs
+
+ vecs = <const char **>malloc(n * sizeof(char *))
+ for i in range(n):
+ val = values[i]
+ v = get_c_string(val)
+ vecs[i] = v
+
+ with nogil:
+ for i in range(n):
+ k = kh_get_str(table, vecs[i])
+ if k != table.n_buckets:
+ resbuf[i] = table.vals[k]
+ else:
+ resbuf[i] = -1
+
+ free(vecs)
+ return labels
+
+ @cython.boundscheck(False)
+ def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
+ # -> np.ndarray[np.intp]
+ # mask not yet implemented
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ object val
+ const char *v
+ khiter_t k
+ intp_t[::1] locs = np.empty(n, dtype=np.intp)
+
+ # these by-definition *must* be strings
+ vecs = <const char **>malloc(n * sizeof(char *))
+ for i in range(n):
+ val = values[i]
+
+ if isinstance(val, str):
+ # GH#31499 if we have a np.str_ get_c_string won't recognize
+ # it as a str, even though isinstance does.
+ v = get_c_string(<str>val)
+ else:
+ v = get_c_string(self.na_string_sentinel)
+ vecs[i] = v
+
+ with nogil:
+ for i in range(n):
+ v = vecs[i]
+ k = kh_get_str(self.table, v)
+ if k != self.table.n_buckets:
+ locs[i] = self.table.vals[k]
+ else:
+ locs[i] = -1
+
+ free(vecs)
+ return np.asarray(locs)
+
+ @cython.boundscheck(False)
+ def map_locations(self, ndarray[object] values, object mask = None) -> None:
+ # mask not yet implemented
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ object val
+ const char *v
+ const char **vecs
+ khiter_t k
+
+ # these by-definition *must* be strings
+ vecs = <const char **>malloc(n * sizeof(char *))
+ for i in range(n):
+ val = values[i]
+
+ if isinstance(val, str):
+ # GH#31499 if we have a np.str_ get_c_string won't recognize
+ # it as a str, even though isinstance does.
+ v = get_c_string(<str>val)
+ else:
+ v = get_c_string(self.na_string_sentinel)
+ vecs[i] = v
+
+ with nogil:
+ for i in range(n):
+ v = vecs[i]
+ k = kh_put_str(self.table, v, &ret)
+ self.table.vals[k] = i
+ free(vecs)
+
+ @cython.boundscheck(False)
+ @cython.wraparound(False)
+ def _unique(self, ndarray[object] values, ObjectVector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None, bint ignore_na=False,
+ bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ uniques : ObjectVector
+ Vector into which uniques will be written
+ count_prior : Py_ssize_t, default 0
+ Number of existing entries in uniques
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then any value
+ that is not a string is considered missing. If na_value is
+ not None, then _additionally_ any value "val" satisfying
+ val == na_value is considered missing.
+ ignore_na : bool, default False
+ Whether NA-values should be ignored for calculating the uniques. If
+ True, the labels corresponding to missing values will be set to
+ na_sentinel.
+ return_inverse : bool, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[intp_t] (if return_inverse=True)
+ The labels from values to uniques
+ """
+ cdef:
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
+ intp_t[::1] labels
+ int64_t[::1] uindexer
+ int ret = 0
+ object val
+ const char *v
+ const char **vecs
+ khiter_t k
+ bint use_na_value
+
+ if return_inverse:
+ labels = np.zeros(n, dtype=np.intp)
+ uindexer = np.empty(n, dtype=np.int64)
+ use_na_value = na_value is not None
+
+ # assign pointers and pre-filter out missing (if ignore_na)
+ vecs = <const char **>malloc(n * sizeof(char *))
+ for i in range(n):
+ val = values[i]
+
+ if (ignore_na
+ and (not isinstance(val, str)
+ or (use_na_value and val == na_value))):
+ # if missing values do not count as unique values (i.e. if
+ # ignore_na is True), we can skip the actual value, and
+ # replace the label with na_sentinel directly
+ labels[i] = na_sentinel
+ else:
+ # if ignore_na is False, we also stringify NaN/None/etc.
+ try:
+ v = get_c_string(<str>val)
+ except UnicodeEncodeError:
+ v = get_c_string(<str>repr(val))
+ vecs[i] = v
+
+ # compute
+ with nogil:
+ for i in range(n):
+ if ignore_na and labels[i] == na_sentinel:
+ # skip entries for ignored missing values (see above)
+ continue
+
+ v = vecs[i]
+ k = kh_get_str(self.table, v)
+ if k == self.table.n_buckets:
+ # k hasn't been seen yet
+ k = kh_put_str(self.table, v, &ret)
+ uindexer[count] = i
+ if return_inverse:
+ self.table.vals[k] = count
+ labels[i] = count
+ count += 1
+ elif return_inverse:
+ # k falls into a previous bucket
+ # only relevant in case we need to construct the inverse
+ idx = self.table.vals[k]
+ labels[i] = idx
+
+ free(vecs)
+
+ # uniques
+ for i in range(count):
+ uniques.append(values[uindexer[i]])
+
+ if return_inverse:
+ return uniques.to_array(), labels.base # .base -> underlying ndarray
+ return uniques.to_array()
+
+ def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ return_inverse : bool, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+ mask : ndarray[bool], optional
+ Not yet implemented for StringHashTable
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[intp_t] (if return_inverse)
+ The labels from values to uniques
+ """
+ uniques = ObjectVector()
+ return self._unique(values, uniques, ignore_na=False,
+ return_inverse=return_inverse)
+
+ def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
+ object na_value=None, object mask=None, ignore_na=True):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Missing values are not included in the "uniques" for this method.
+ The labels for any missing values will be set to "na_sentinel"
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then any value
+ that is not a string is considered missing. If na_value is
+ not None, then _additionally_ any value "val" satisfying
+ val == na_value is considered missing.
+ mask : ndarray[bool], optional
+ Not yet implemented for StringHashTable.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[intp]
+ The labels from values to uniques
+ """
+ uniques_vector = ObjectVector()
+ return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+ na_value=na_value, ignore_na=ignore_na,
+ return_inverse=True)
+
+ def get_labels(self, ndarray[object] values, ObjectVector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ # -> np.ndarray[np.intp]
+ _, labels = self._unique(values, uniques, count_prior=count_prior,
+ na_sentinel=na_sentinel, na_value=na_value,
+ ignore_na=True, return_inverse=True)
+ return labels
+
+
+cdef class PyObjectHashTable(HashTable):
+
+ def __init__(self, int64_t size_hint=1):
+ self.table = kh_init_pymap()
+ size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
+ kh_resize_pymap(self.table, size_hint)
+
+ def __dealloc__(self):
+ if self.table is not NULL:
+ kh_destroy_pymap(self.table)
+ self.table = NULL
+
+ def __len__(self) -> int:
+ return self.table.size
+
+ def __contains__(self, object key) -> bool:
+ cdef:
+ khiter_t k
+ hash(key)
+
+ k = kh_get_pymap(self.table, <PyObject*>key)
+ return k != self.table.n_buckets
+
+ def sizeof(self, deep: bool = False) -> int:
+ """ return the size of my table in bytes """
+ overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
+ for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
+ for_pairs = self.table.n_buckets * (sizeof(PyObject *) + # keys
+ sizeof(Py_ssize_t)) # vals
+ return overhead + for_flags + for_pairs
+
+ def get_state(self) -> dict[str, int]:
+ """
+        Returns info about the current state of the hashtable, such as its
+        size, number of buckets, and so on.
+ """
+ return {
+ 'n_buckets' : self.table.n_buckets,
+ 'size' : self.table.size,
+ 'n_occupied' : self.table.n_occupied,
+ 'upper_bound' : self.table.upper_bound,
+ }
+
+ cpdef get_item(self, object val):
+ cdef:
+ khiter_t k
+
+ k = kh_get_pymap(self.table, <PyObject*>val)
+ if k != self.table.n_buckets:
+ return self.table.vals[k]
+ else:
+ raise KeyError(val)
+
+ cpdef set_item(self, object key, Py_ssize_t val):
+ cdef:
+ khiter_t k
+ int ret = 0
+ char* buf
+
+ hash(key)
+
+ k = kh_put_pymap(self.table, <PyObject*>key, &ret)
+ if kh_exist_pymap(self.table, k):
+ self.table.vals[k] = val
+ else:
+ raise KeyError(key)
+
+ def map_locations(self, ndarray[object] values, object mask = None) -> None:
+ # mask not yet implemented
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ object val
+ khiter_t k
+
+ for i in range(n):
+ val = values[i]
+ hash(val)
+
+ k = kh_put_pymap(self.table, <PyObject*>val, &ret)
+ self.table.vals[k] = i
+
+ def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
+ # -> np.ndarray[np.intp]
+ # mask not yet implemented
+ cdef:
+ Py_ssize_t i, n = len(values)
+ int ret = 0
+ object val
+ khiter_t k
+ intp_t[::1] locs = np.empty(n, dtype=np.intp)
+
+ for i in range(n):
+ val = values[i]
+ hash(val)
+
+ k = kh_get_pymap(self.table, <PyObject*>val)
+ if k != self.table.n_buckets:
+ locs[i] = self.table.vals[k]
+ else:
+ locs[i] = -1
+
+ return np.asarray(locs)
+
+ @cython.boundscheck(False)
+ @cython.wraparound(False)
+ def _unique(self, ndarray[object] values, ObjectVector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None, bint ignore_na=False,
+ bint return_inverse=False):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ uniques : ObjectVector
+ Vector into which uniques will be written
+ count_prior : Py_ssize_t, default 0
+ Number of existing entries in uniques
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then None _plus_
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+ ignore_na : bool, default False
+ Whether NA-values should be ignored for calculating the uniques. If
+ True, the labels corresponding to missing values will be set to
+ na_sentinel.
+ return_inverse : bool, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[intp_t] (if return_inverse=True)
+ The labels from values to uniques
+ """
+ cdef:
+ Py_ssize_t i, idx, count = count_prior, n = len(values)
+ intp_t[::1] labels
+ int ret = 0
+ object val
+ khiter_t k
+ bint use_na_value
+
+ if return_inverse:
+ labels = np.empty(n, dtype=np.intp)
+ use_na_value = na_value is not None
+
+ for i in range(n):
+ val = values[i]
+ hash(val)
+
+ if ignore_na and (
+ checknull(val)
+ or (use_na_value and val == na_value)
+ ):
+ # if missing values do not count as unique values (i.e. if
+ # ignore_na is True), skip the hashtable entry for them, and
+ # replace the corresponding label with na_sentinel
+ labels[i] = na_sentinel
+ continue
+
+ k = kh_get_pymap(self.table, <PyObject*>val)
+ if k == self.table.n_buckets:
+ # k hasn't been seen yet
+ k = kh_put_pymap(self.table, <PyObject*>val, &ret)
+ uniques.append(val)
+ if return_inverse:
+ self.table.vals[k] = count
+ labels[i] = count
+ count += 1
+ elif return_inverse:
+ # k falls into a previous bucket
+ # only relevant in case we need to construct the inverse
+ idx = self.table.vals[k]
+ labels[i] = idx
+
+ if return_inverse:
+ return uniques.to_array(), labels.base # .base -> underlying ndarray
+ return uniques.to_array()
+
+ def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ return_inverse : bool, default False
+ Whether the mapping of the original array values to their location
+ in the vector of uniques should be returned.
+ mask : ndarray[bool], optional
+ Not yet implemented for PyObjectHashTable
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[intp_t] (if return_inverse)
+ The labels from values to uniques
+ """
+ uniques = ObjectVector()
+ return self._unique(values, uniques, ignore_na=False,
+ return_inverse=return_inverse)
+
+ def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
+ object na_value=None, object mask=None, ignore_na=True):
+ """
+ Calculate unique values and labels (no sorting!)
+
+ Missing values are not included in the "uniques" for this method.
+ The labels for any missing values will be set to "na_sentinel"
+
+ Parameters
+ ----------
+ values : ndarray[object]
+ Array of values of which unique will be calculated
+ na_sentinel : Py_ssize_t, default -1
+ Sentinel value used for all NA-values in inverse
+ na_value : object, default None
+ Value to identify as missing. If na_value is None, then None _plus_
+ any value "val" satisfying val != val is considered missing.
+ If na_value is not None, then _additionally_, any value "val"
+ satisfying val == na_value is considered missing.
+ mask : ndarray[bool], optional
+ Not yet implemented for PyObjectHashTable.
+
+ Returns
+ -------
+ uniques : ndarray[object]
+ Unique values of input, not sorted
+ labels : ndarray[intp_t]
+ The labels from values to uniques
+ """
+ uniques_vector = ObjectVector()
+ return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+ na_value=na_value, ignore_na=ignore_na,
+ return_inverse=True)
+
+ def get_labels(self, ndarray[object] values, ObjectVector uniques,
+ Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+ object na_value=None):
+ # -> np.ndarray[np.intp]
+ _, labels = self._unique(values, uniques, count_prior=count_prior,
+ na_sentinel=na_sentinel, na_value=na_value,
+ ignore_na=True, return_inverse=True)
+ return labels
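Stripped of the khash machinery, factorize on any of these tables computes insertion-ordered uniques plus integer labels, with missing values excluded from the uniques and labelled na_sentinel. A pure-Python sketch of the PyObjectHashTable semantics (a behavioural model, not the compiled path):

import numpy as np

def factorize_objects(values, na_sentinel=-1):
    table = {}    # value -> position in uniques
    uniques = []  # first-seen order
    labels = np.empty(len(values), dtype=np.intp)
    for i, val in enumerate(values):
        if val is None or (isinstance(val, float) and val != val):
            labels[i] = na_sentinel  # missing: excluded from uniques
            continue
        if val not in table:
            table[val] = len(uniques)
            uniques.append(val)
        labels[i] = table[val]
    return np.array(uniques, dtype=object), labels

print(factorize_objects(["a", None, "b", "a"]))
# (array(['a', 'b'], dtype=object), array([ 0, -1,  1,  0]))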
diff --git a/contrib/python/pandas/py3/pandas/_libs/hashtable_func_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/hashtable_func_helper.pxi.in
new file mode 100644
index 0000000000..b9cf601148
--- /dev/null
+++ b/contrib/python/pandas/py3/pandas/_libs/hashtable_func_helper.pxi.in
@@ -0,0 +1,484 @@
+"""
+Template for each `dtype` helper function for hashtable
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+{{py:
+
+# name, dtype, ttype, c_type, to_c_type
+dtypes = [('Complex128', 'complex128', 'complex128',
+ 'khcomplex128_t', 'to_khcomplex128_t'),
+ ('Complex64', 'complex64', 'complex64',
+ 'khcomplex64_t', 'to_khcomplex64_t'),
+ ('Float64', 'float64', 'float64', 'float64_t', ''),
+ ('Float32', 'float32', 'float32', 'float32_t', ''),
+ ('UInt64', 'uint64', 'uint64', 'uint64_t', ''),
+ ('UInt32', 'uint32', 'uint32', 'uint32_t', ''),
+ ('UInt16', 'uint16', 'uint16', 'uint16_t', ''),
+ ('UInt8', 'uint8', 'uint8', 'uint8_t', ''),
+ ('Object', 'object', 'pymap', 'object', '<PyObject*>'),
+ ('Int64', 'int64', 'int64', 'int64_t', ''),
+ ('Int32', 'int32', 'int32', 'int32_t', ''),
+ ('Int16', 'int16', 'int16', 'int16_t', ''),
+ ('Int8', 'int8', 'int8', 'int8_t', '')]
+
+}}
+
+{{for name, dtype, ttype, c_type, to_c_type in dtypes}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+{{if dtype == 'object'}}
+cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t[:] mask=None):
+{{else}}
+cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None):
+{{endif}}
+ cdef:
+ Py_ssize_t i = 0
+ Py_ssize_t n = len(values)
+ kh_{{ttype}}_t *table
+
+ # Don't use Py_ssize_t, since table.n_buckets is unsigned
+ khiter_t k
+
+ {{c_type}} val
+
+ int ret = 0
+ bint uses_mask = mask is not None
+ bint isna_entry = False
+
+ if uses_mask and not dropna:
+ raise NotImplementedError("uses_mask not implemented with dropna=False")
+
+    # we track the order in which keys are first seen (GH39009);
+    # the khash map isn't insertion-ordered, so:
+ # table maps keys to counts
+ # result_keys remembers the original order of keys
+
+ result_keys = {{name}}Vector()
+ table = kh_init_{{ttype}}()
+
+ {{if dtype == 'object'}}
+ if uses_mask:
+ raise NotImplementedError("uses_mask not implemented with object dtype")
+
+ kh_resize_{{ttype}}(table, n // 10)
+
+ for i in range(n):
+ val = values[i]
+ if not dropna or not checknull(val):
+ k = kh_get_{{ttype}}(table, {{to_c_type}}val)
+ if k != table.n_buckets:
+ table.vals[k] += 1
+ else:
+ k = kh_put_{{ttype}}(table, {{to_c_type}}val, &ret)
+ table.vals[k] = 1
+ result_keys.append(val)
+ {{else}}
+ kh_resize_{{ttype}}(table, n)
+
+ for i in range(n):
+ val = {{to_c_type}}(values[i])
+
+ if dropna:
+ if uses_mask:
+ isna_entry = mask[i]
+ else:
+ isna_entry = is_nan_{{c_type}}(val)
+
+ if not dropna or not isna_entry:
+ k = kh_get_{{ttype}}(table, val)
+ if k != table.n_buckets:
+ table.vals[k] += 1
+ else:
+ k = kh_put_{{ttype}}(table, val, &ret)
+ table.vals[k] = 1
+ result_keys.append(val)
+ {{endif}}
+
+ # collect counts in the order corresponding to result_keys:
+ cdef:
+ int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
+
+ for i in range(table.size):
+ {{if dtype == 'object'}}
+ k = kh_get_{{ttype}}(table, result_keys.data[i])
+ {{else}}
+ k = kh_get_{{ttype}}(table, result_keys.data.data[i])
+ {{endif}}
+ result_counts[i] = table.vals[k]
+
+ kh_destroy_{{ttype}}(table)
+
+ return result_keys.to_array(), result_counts.base
+
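The khash table only maps keys to counts, so result_keys carries the first-seen order separately. In plain Python the same contract can be sketched with a dict, which is insertion-ordered; the NaN check here is a stand-in for checknull / is_nan:

    def value_count_py(values, dropna):
        counts = {}                     # a dict preserves first-seen key order
        for val in values:
            if dropna and val != val:   # NaN check, standing in for checknull
                continue
            counts[val] = counts.get(val, 0) + 1
        return list(counts), list(counts.values())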
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+{{if dtype == 'object'}}
+cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None):
+{{else}}
+cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None):
+{{endif}}
+ cdef:
+ int ret = 0
+ {{if dtype != 'object'}}
+ {{c_type}} value
+ {{else}}
+ PyObject* value
+ {{endif}}
+ Py_ssize_t i, n = len(values), first_na = -1
+ khiter_t k
+ kh_{{ttype}}_t *table = kh_init_{{ttype}}()
+ ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
+ bint seen_na = False, uses_mask = mask is not None
+ bint seen_multiple_na = False
+
+ kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
+
+ if keep not in ('last', 'first', False):
+ raise ValueError('keep must be either "first", "last" or False')
+
+ {{for cond, keep in [('if', '"last"'), ('elif', '"first"')]}}
+ {{cond}} keep == {{keep}}:
+ {{if dtype == 'object'}}
+ if True:
+ {{else}}
+ with nogil:
+ {{endif}}
+ {{if keep == '"last"'}}
+ for i in range(n - 1, -1, -1):
+ {{else}}
+ for i in range(n):
+ {{endif}}
+ if uses_mask and mask[i]:
+ if seen_na:
+ out[i] = True
+ else:
+ out[i] = False
+ seen_na = True
+ else:
+ value = {{to_c_type}}(values[i])
+ kh_put_{{ttype}}(table, value, &ret)
+ out[i] = ret == 0
+ {{endfor}}
+
+ else:
+ {{if dtype == 'object'}}
+ if True:
+ {{else}}
+ with nogil:
+ {{endif}}
+ for i in range(n):
+ if uses_mask and mask[i]:
+ if not seen_na:
+ first_na = i
+ seen_na = True
+ out[i] = 0
+ elif not seen_multiple_na:
+ out[i] = 1
+ out[first_na] = 1
+ seen_multiple_na = True
+ else:
+ out[i] = 1
+
+ else:
+ value = {{to_c_type}}(values[i])
+ k = kh_get_{{ttype}}(table, value)
+ if k != table.n_buckets:
+ out[table.vals[k]] = 1
+ out[i] = 1
+ else:
+ k = kh_put_{{ttype}}(table, value, &ret)
+ table.vals[k] = i
+ out[i] = 0
+
+ kh_destroy_{{ttype}}(table)
+ return out
+
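For reference, the three keep modes above reduce to: mark anything already in the table (keep='first'), the same scan run backwards (keep='last'), or mark every occurrence of any repeated value (keep=False). A pure-Python sketch, ignoring the mask handling:

    from collections import Counter

    def duplicated_py(values, keep="first"):
        if keep == "last":              # scan from the end, then flip back
            return duplicated_py(values[::-1], "first")[::-1]
        if keep is False:               # mark every occurrence of a repeated value
            counts = Counter(values)
            return [counts[v] > 1 for v in values]
        seen = set()                    # keep == "first"
        out = []
        for v in values:
            out.append(v in seen)
            seen.add(v)
        return out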
+
+# ----------------------------------------------------------------------
+# Membership
+# ----------------------------------------------------------------------
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+{{if dtype == 'object'}}
+cdef ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values):
+{{else}}
+cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
+{{endif}}
+ """
+    Return a boolean array indicating, element by element,
+    whether each value in arr is contained in values
+
+ Parameters
+ ----------
+ arr : {{dtype}} ndarray
+ values : {{dtype}} ndarray
+
+ Returns
+ -------
+    ndarray[bool] of the same length as arr
+ """
+ cdef:
+ Py_ssize_t i, n
+ khiter_t k
+ int ret = 0
+ ndarray[uint8_t] result
+
+ {{if dtype == "object"}}
+ PyObject* val
+ {{else}}
+ {{c_type}} val
+ {{endif}}
+
+ kh_{{ttype}}_t *table = kh_init_{{ttype}}()
+
+ # construct the table
+ n = len(values)
+ kh_resize_{{ttype}}(table, n)
+
+ {{if dtype == 'object'}}
+ if True:
+ {{else}}
+ with nogil:
+ {{endif}}
+ for i in range(n):
+ val = {{to_c_type}}(values[i])
+ kh_put_{{ttype}}(table, val, &ret)
+
+ # test membership
+ n = len(arr)
+ result = np.empty(n, dtype=np.uint8)
+
+ {{if dtype == 'object'}}
+ if True:
+ {{else}}
+ with nogil:
+ {{endif}}
+ for i in range(n):
+ val = {{to_c_type}}(arr[i])
+ k = kh_get_{{ttype}}(table, val)
+ result[i] = (k != table.n_buckets)
+
+ kh_destroy_{{ttype}}(table)
+ return result.view(np.bool_)
+
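The two-pass structure above (build a table from values, then probe it for each element of arr) is the hash-based analogue of np.isin; schematically:

    def ismember_py(arr, values):
        table = set(values)                  # pass 1: build the lookup table
        return [x in table for x in arr]     # pass 2: probe each element of arr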
+# ----------------------------------------------------------------------
+# Mode Computations
+# ----------------------------------------------------------------------
+
+{{endfor}}
+
+
+ctypedef fused htfunc_t:
+ numeric_object_t
+ complex128_t
+ complex64_t
+
+
+cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
+ if htfunc_t is object:
+ return value_count_object(values, dropna, mask=mask)
+
+ elif htfunc_t is int8_t:
+ return value_count_int8(values, dropna, mask=mask)
+ elif htfunc_t is int16_t:
+ return value_count_int16(values, dropna, mask=mask)
+ elif htfunc_t is int32_t:
+ return value_count_int32(values, dropna, mask=mask)
+ elif htfunc_t is int64_t:
+ return value_count_int64(values, dropna, mask=mask)
+
+ elif htfunc_t is uint8_t:
+ return value_count_uint8(values, dropna, mask=mask)
+ elif htfunc_t is uint16_t:
+ return value_count_uint16(values, dropna, mask=mask)
+ elif htfunc_t is uint32_t:
+ return value_count_uint32(values, dropna, mask=mask)
+ elif htfunc_t is uint64_t:
+ return value_count_uint64(values, dropna, mask=mask)
+
+ elif htfunc_t is float64_t:
+ return value_count_float64(values, dropna, mask=mask)
+ elif htfunc_t is float32_t:
+ return value_count_float32(values, dropna, mask=mask)
+
+ elif htfunc_t is complex128_t:
+ return value_count_complex128(values, dropna, mask=mask)
+ elif htfunc_t is complex64_t:
+ return value_count_complex64(values, dropna, mask=mask)
+
+ else:
+ raise TypeError(values.dtype)
+
+
+cpdef duplicated(ndarray[htfunc_t] values, object keep="first", const uint8_t[:] mask=None):
+ if htfunc_t is object:
+ return duplicated_object(values, keep, mask=mask)
+
+ elif htfunc_t is int8_t:
+ return duplicated_int8(values, keep, mask=mask)
+ elif htfunc_t is int16_t:
+ return duplicated_int16(values, keep, mask=mask)
+ elif htfunc_t is int32_t:
+ return duplicated_int32(values, keep, mask=mask)
+ elif htfunc_t is int64_t:
+ return duplicated_int64(values, keep, mask=mask)
+
+ elif htfunc_t is uint8_t:
+ return duplicated_uint8(values, keep, mask=mask)
+ elif htfunc_t is uint16_t:
+ return duplicated_uint16(values, keep, mask=mask)
+ elif htfunc_t is uint32_t:
+ return duplicated_uint32(values, keep, mask=mask)
+ elif htfunc_t is uint64_t:
+ return duplicated_uint64(values, keep, mask=mask)
+
+ elif htfunc_t is float64_t:
+ return duplicated_float64(values, keep, mask=mask)
+ elif htfunc_t is float32_t:
+ return duplicated_float32(values, keep, mask=mask)
+
+ elif htfunc_t is complex128_t:
+ return duplicated_complex128(values, keep, mask=mask)
+ elif htfunc_t is complex64_t:
+ return duplicated_complex64(values, keep, mask=mask)
+
+ else:
+ raise TypeError(values.dtype)
+
+
+cpdef ismember(ndarray[htfunc_t] arr, ndarray[htfunc_t] values):
+ if htfunc_t is object:
+ return ismember_object(arr, values)
+
+ elif htfunc_t is int8_t:
+ return ismember_int8(arr, values)
+ elif htfunc_t is int16_t:
+ return ismember_int16(arr, values)
+ elif htfunc_t is int32_t:
+ return ismember_int32(arr, values)
+ elif htfunc_t is int64_t:
+ return ismember_int64(arr, values)
+
+ elif htfunc_t is uint8_t:
+ return ismember_uint8(arr, values)
+ elif htfunc_t is uint16_t:
+ return ismember_uint16(arr, values)
+ elif htfunc_t is uint32_t:
+ return ismember_uint32(arr, values)
+ elif htfunc_t is uint64_t:
+ return ismember_uint64(arr, values)
+
+ elif htfunc_t is float64_t:
+ return ismember_float64(arr, values)
+ elif htfunc_t is float32_t:
+ return ismember_float32(arr, values)
+
+ elif htfunc_t is complex128_t:
+ return ismember_complex128(arr, values)
+ elif htfunc_t is complex64_t:
+ return ismember_complex64(arr, values)
+
+ else:
+ raise TypeError(values.dtype)
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
+    # TODO(cython3): use const htfunc_t[:]
+
+ cdef:
+ ndarray[htfunc_t] keys
+ ndarray[htfunc_t] modes
+
+ int64_t[::1] counts
+ int64_t count, max_count = -1
+ Py_ssize_t nkeys, k, j = 0
+
+ keys, counts = value_count(values, dropna, mask=mask)
+ nkeys = len(keys)
+
+ modes = np.empty(nkeys, dtype=values.dtype)
+
+ if htfunc_t is not object:
+ with nogil:
+ for k in range(nkeys):
+ count = counts[k]
+ if count == max_count:
+ j += 1
+ elif count > max_count:
+ max_count = count
+ j = 0
+ else:
+ continue
+
+ modes[j] = keys[k]
+ else:
+ for k in range(nkeys):
+ count = counts[k]
+ if count == max_count:
+ j += 1
+ elif count > max_count:
+ max_count = count
+ j = 0
+ else:
+ continue
+
+ modes[j] = keys[k]
+
+ return modes[:j + 1]
+
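Note that mode keeps every key whose count ties the running maximum, so multimodal input returns all modes in first-seen order (that is what modes[:j + 1] slices out). Equivalently, in plain Python:

    from collections import Counter

    def mode_py(values):
        counts = Counter(values)            # insertion-ordered in CPython 3.7+
        if not counts:
            return []
        max_count = max(counts.values())
        # all keys tying the max, in first-seen order (matches modes[:j + 1])
        return [k for k, c in counts.items() if c == max_count]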
+
+{{py:
+
+# name, dtype, ttype, c_type
+dtypes = [('Int64', 'int64', 'int64', 'int64_t'),
+ ('Int32', 'int32', 'int32', 'int32_t'), ]
+
+}}
+
+{{for name, dtype, ttype, c_type in dtypes}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
+ """
+ Indices of the first occurrences of the unique labels
+    *excluding* -1. Equivalent to:
+ np.unique(labels, return_index=True)[1]
+ """
+ cdef:
+ int ret = 0
+ Py_ssize_t i, n = len(labels)
+ kh_{{ttype}}_t *table = kh_init_{{ttype}}()
+ {{name}}Vector idx = {{name}}Vector()
+ ndarray[{{c_type}}, ndim=1] arr
+ {{name}}VectorData *ud = idx.data
+
+ kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
+
+ with nogil:
+ for i in range(n):
+ kh_put_{{ttype}}(table, labels[i], &ret)
+ if ret != 0:
+ if needs_resize(ud):
+ with gil:
+ idx.resize()
+ append_data_{{ttype}}(ud, i)
+
+ kh_destroy_{{ttype}}(table)
+
+ arr = idx.to_array()
+ arr = arr[np.asarray(labels)[arr].argsort()]
+
+ return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
+
+{{endfor}}
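A worked example of the docstring's equivalence, with the values chosen here purely for illustration:

    import numpy as np

    labels = np.array([-1, 0, 0, 2, -1, 1], dtype=np.int64)
    idx = np.unique(labels, return_index=True)[1]  # first occurrence per label
    # idx -> array([0, 1, 5, 3]) for sorted labels [-1, 0, 1, 2]
    # the helper above drops the slot for -1 and returns array([1, 5, 3])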
diff --git a/contrib/python/pandas/py3/pandas/_libs/index_class_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/index_class_helper.pxi.in
new file mode 100644
index 0000000000..bf3d88edd9
--- /dev/null
+++ b/contrib/python/pandas/py3/pandas/_libs/index_class_helper.pxi.in
@@ -0,0 +1,78 @@
+"""
+Template for functions of IndexEngine subclasses.
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+# ----------------------------------------------------------------------
+# IndexEngine Subclass Methods
+# ----------------------------------------------------------------------
+
+{{py:
+
+# name, dtype
+dtypes = [('Float64', 'float64'),
+ ('Float32', 'float32'),
+ ('Int64', 'int64'),
+ ('Int32', 'int32'),
+ ('Int16', 'int16'),
+ ('Int8', 'int8'),
+ ('UInt64', 'uint64'),
+ ('UInt32', 'uint32'),
+ ('UInt16', 'uint16'),
+ ('UInt8', 'uint8'),
+ ('Complex64', 'complex64'),
+ ('Complex128', 'complex128'),
+ ]
+
+engines = [('', 'IndexEngine'), ('Masked', 'MaskedIndexEngine')]
+
+}}
+
+{{for name, dtype in dtypes}}
+
+{{for prefix, engine in engines}}
+
+cdef class {{prefix}}{{name}}Engine({{engine}}):
+
+ cdef _make_hash_table(self, Py_ssize_t n):
+ {{if engine == 'MaskedIndexEngine'}}
+ return _hash.{{name}}HashTable(n, uses_mask=True)
+ {{else}}
+ return _hash.{{name}}HashTable(n)
+ {{endif}}
+
+ cdef _check_type(self, object val):
+ {{if engine == 'MaskedIndexEngine'}}
+ if val is C_NA:
+ return val
+ {{endif}}
+ {{if name not in {'Float64', 'Float32', 'Complex64', 'Complex128'} }}
+ if not util.is_integer_object(val):
+ if util.is_float_object(val):
+ # Make sure Int64Index.get_loc(2.0) works
+ if val.is_integer():
+ return int(val)
+ raise KeyError(val)
+ {{if name.startswith("U")}}
+ if val < 0:
+ # cannot have negative values with unsigned int dtype
+ raise KeyError(val)
+ {{endif}}
+ {{elif name not in {'Complex64', 'Complex128'} }}
+ if not util.is_integer_object(val) and not util.is_float_object(val):
+ # in particular catch bool and avoid casting True -> 1.0
+ raise KeyError(val)
+ {{else}}
+ if (not util.is_integer_object(val)
+ and not util.is_float_object(val)
+ and not util.is_complex_object(val)
+ ):
+ # in particular catch bool and avoid casting True -> 1.0
+ raise KeyError(val)
+ {{endif}}
+ return val
+
+{{endfor}}
+
+{{endfor}}
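The net effect of _check_type for the integer engines: an integral float such as 2.0 is coerced so that Int64Index.get_loc(2.0) succeeds, while 2.5, booleans, and (for unsigned engines) negative values raise KeyError. Roughly, in plain Python (check_int_key is a hypothetical stand-in for the generated method):

    def check_int_key(val, unsigned=False):
        if isinstance(val, bool):
            raise KeyError(val)        # bool is not a valid integer key
        if isinstance(val, float):
            if not val.is_integer():
                raise KeyError(val)
            val = int(val)             # 2.0 -> 2, so get_loc(2.0) works
        if unsigned and val < 0:
            raise KeyError(val)        # no negatives with unsigned dtypes
        return val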
diff --git a/contrib/python/pandas/py3/pandas/_libs/intervaltree.pxi.in b/contrib/python/pandas/py3/pandas/_libs/intervaltree.pxi.in
new file mode 100644
index 0000000000..67fee7c5fb
--- /dev/null
+++ b/contrib/python/pandas/py3/pandas/_libs/intervaltree.pxi.in
@@ -0,0 +1,434 @@
+"""
+Template for intervaltree
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+from pandas._libs.algos import is_monotonic
+
+ctypedef fused int_scalar_t:
+ int64_t
+ float64_t
+
+ctypedef fused uint_scalar_t:
+ uint64_t
+ float64_t
+
+ctypedef fused scalar_t:
+ int_scalar_t
+ uint_scalar_t
+
+# ----------------------------------------------------------------------
+# IntervalTree
+# ----------------------------------------------------------------------
+
+cdef class IntervalTree(IntervalMixin):
+ """A centered interval tree
+
+    Based on the algorithm described on Wikipedia:
+ https://en.wikipedia.org/wiki/Interval_tree
+
+    We emulate the IndexEngine interface.
+ """
+ cdef readonly:
+ ndarray left, right
+ IntervalNode root
+ object dtype
+ str closed
+ object _is_overlapping, _left_sorter, _right_sorter
+ Py_ssize_t _na_count
+
+ def __init__(self, left, right, closed='right', leaf_size=100):
+ """
+ Parameters
+ ----------
+ left, right : np.ndarray[ndim=1]
+ Left and right bounds for each interval. Assumed to contain no
+ NaNs.
+ closed : {'left', 'right', 'both', 'neither'}, optional
+ Whether the intervals are closed on the left-side, right-side, both
+ or neither. Defaults to 'right'.
+ leaf_size : int, optional
+ Parameter that controls when the tree switches from creating nodes
+ to brute-force search. Tune this parameter to optimize query
+ performance.
+ """
+ if closed not in ['left', 'right', 'both', 'neither']:
+ raise ValueError("invalid option for 'closed': %s" % closed)
+
+ left = np.asarray(left)
+ right = np.asarray(right)
+ self.dtype = np.result_type(left, right)
+ self.left = np.asarray(left, dtype=self.dtype)
+ self.right = np.asarray(right, dtype=self.dtype)
+
+ indices = np.arange(len(left), dtype='int64')
+
+ self.closed = closed
+
+ # GH 23352: ensure no nan in nodes
+ mask = ~np.isnan(self.left)
+ self._na_count = len(mask) - mask.sum()
+ self.left = self.left[mask]
+ self.right = self.right[mask]
+ indices = indices[mask]
+
+ node_cls = NODE_CLASSES[str(self.dtype), closed]
+ self.root = node_cls(self.left, self.right, indices, leaf_size)
+
+ @property
+ def left_sorter(self) -> np.ndarray:
+ """How to sort the left labels; this is used for binary search
+ """
+ if self._left_sorter is None:
+ values = [self.right, self.left]
+ self._left_sorter = np.lexsort(values)
+ return self._left_sorter
+
+ @property
+ def right_sorter(self) -> np.ndarray:
+ """How to sort the right labels
+ """
+ if self._right_sorter is None:
+ self._right_sorter = np.argsort(self.right)
+ return self._right_sorter
+
+ @property
+ def is_overlapping(self) -> bool:
+ """
+ Determine if the IntervalTree contains overlapping intervals.
+ Cached as self._is_overlapping.
+ """
+ if self._is_overlapping is not None:
+ return self._is_overlapping
+
+ # <= when both sides closed since endpoints can overlap
+ op = le if self.closed == 'both' else lt
+
+ # overlap if start of current interval < end of previous interval
+ # (current and previous in terms of sorted order by left/start side)
+ current = self.left[self.left_sorter[1:]]
+ previous = self.right[self.left_sorter[:-1]]
+ self._is_overlapping = bool(op(current, previous).any())
+
+ return self._is_overlapping
+
+ @property
+ def is_monotonic_increasing(self) -> bool:
+ """
+ Return True if the IntervalTree is monotonic increasing (only equal or
+ increasing values), else False
+ """
+ if self._na_count > 0:
+ return False
+
+ sort_order = self.left_sorter
+ return is_monotonic(sort_order, False)[0]
+
+ def get_indexer(self, scalar_t[:] target) -> np.ndarray:
+ """Return the positions corresponding to unique intervals that overlap
+ with the given array of scalar targets.
+ """
+
+ # TODO: write get_indexer_intervals
+ cdef:
+ Py_ssize_t old_len
+ Py_ssize_t i
+ Int64Vector result
+
+ result = Int64Vector()
+ old_len = 0
+ for i in range(len(target)):
+ try:
+ self.root.query(result, target[i])
+ except OverflowError:
+ # overflow -> no match, which is already handled below
+ pass
+
+ if result.data.n == old_len:
+ result.append(-1)
+ elif result.data.n > old_len + 1:
+ raise KeyError(
+ 'indexer does not intersect a unique set of intervals')
+ old_len = result.data.n
+ return result.to_array().astype('intp')
+
+ def get_indexer_non_unique(self, scalar_t[:] target):
+ """Return the positions corresponding to intervals that overlap with
+ the given array of scalar targets. Non-unique positions are repeated.
+ """
+ cdef:
+ Py_ssize_t old_len
+ Py_ssize_t i
+ Int64Vector result, missing
+
+ result = Int64Vector()
+ missing = Int64Vector()
+ old_len = 0
+ for i in range(len(target)):
+ try:
+ self.root.query(result, target[i])
+ except OverflowError:
+ # overflow -> no match, which is already handled below
+ pass
+
+ if result.data.n == old_len:
+ result.append(-1)
+ missing.append(i)
+ old_len = result.data.n
+ return (result.to_array().astype('intp'),
+ missing.to_array().astype('intp'))
+
+ def __repr__(self) -> str:
+ return ('<IntervalTree[{dtype},{closed}]: '
+ '{n_elements} elements>'.format(
+ dtype=self.dtype, closed=self.closed,
+ n_elements=self.root.n_elements))
+
+ # compat with IndexEngine interface
+ def clear_mapping(self) -> None:
+ pass
+
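A small usage sketch of the tree, assuming it is exposed as pandas._libs.interval.IntervalTree as in upstream pandas:

    import numpy as np
    from pandas._libs.interval import IntervalTree  # assumed import path

    tree = IntervalTree(left=np.array([0.0, 2.0, 5.0]),
                        right=np.array([1.0, 4.0, 6.0]),
                        closed="right")
    tree.is_overlapping                      # False: (0, 1], (2, 4], (5, 6]
    tree.get_indexer(np.array([0.5, 3.0]))   # -> array([0, 1]), interval positions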
+
+cdef take(ndarray source, ndarray indices):
+ """Take the given positions from a 1D ndarray
+ """
+ return PyArray_Take(source, indices, 0)
+
+
+cdef sort_values_and_indices(all_values, all_indices, subset):
+ indices = take(all_indices, subset)
+ values = take(all_values, subset)
+ sorter = PyArray_ArgSort(values, 0, NPY_QUICKSORT)
+ sorted_values = take(values, sorter)
+ sorted_indices = take(indices, sorter)
+ return sorted_values, sorted_indices
+
+
+# ----------------------------------------------------------------------
+# Nodes
+# ----------------------------------------------------------------------
+
+@cython.internal
+cdef class IntervalNode:
+ cdef readonly:
+ int64_t n_elements, n_center, leaf_size
+ bint is_leaf_node
+
+ def __repr__(self) -> str:
+ if self.is_leaf_node:
+ return (
+ f"<{type(self).__name__}: {self.n_elements} elements (terminal)>"
+ )
+ else:
+ n_left = self.left_node.n_elements
+ n_right = self.right_node.n_elements
+ n_center = self.n_elements - n_left - n_right
+ return (
+ f"<{type(self).__name__}: "
+ f"pivot {self.pivot}, {self.n_elements} elements "
+ f"({n_left} left, {n_right} right, {n_center} overlapping)>"
+ )
+
+ def counts(self):
+ """
+        Inspect counts on this node;
+        useful for debugging purposes.
+ """
+ if self.is_leaf_node:
+ return self.n_elements
+ else:
+ m = len(self.center_left_values)
+ l = self.left_node.counts()
+ r = self.right_node.counts()
+ return (m, (l, r))
+
+
+# we need specialized nodes and leaves to optimize for different dtype and
+# closed values
+
+{{py:
+
+nodes = []
+for dtype in ['float64', 'int64', 'uint64']:
+ for closed, cmp_left, cmp_right in [
+ ('left', '<=', '<'),
+ ('right', '<', '<='),
+ ('both', '<=', '<='),
+ ('neither', '<', '<')]:
+ cmp_left_converse = '<' if cmp_left == '<=' else '<='
+ cmp_right_converse = '<' if cmp_right == '<=' else '<='
+ if dtype.startswith('int'):
+ fused_prefix = 'int_'
+ elif dtype.startswith('uint'):
+ fused_prefix = 'uint_'
+ elif dtype.startswith('float'):
+ fused_prefix = ''
+ nodes.append((dtype, dtype.title(),
+ closed, closed.title(),
+ cmp_left,
+ cmp_right,
+ cmp_left_converse,
+ cmp_right_converse,
+ fused_prefix))
+
+}}
+
+NODE_CLASSES = {}
+
+{{for dtype, dtype_title, closed, closed_title, cmp_left, cmp_right,
+ cmp_left_converse, cmp_right_converse, fused_prefix in nodes}}
+
+
+@cython.internal
+cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode(IntervalNode):
+ """Non-terminal node for an IntervalTree
+
+ Categorizes intervals by those that fall to the left, those that fall to
+ the right, and those that overlap with the pivot.
+ """
+ cdef readonly:
+ {{dtype_title}}Closed{{closed_title}}IntervalNode left_node, right_node
+ {{dtype}}_t[:] center_left_values, center_right_values, left, right
+ int64_t[:] center_left_indices, center_right_indices, indices
+ {{dtype}}_t min_left, max_right
+ {{dtype}}_t pivot
+
+ def __init__(self,
+ ndarray[{{dtype}}_t, ndim=1] left,
+ ndarray[{{dtype}}_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ int64_t leaf_size):
+
+ self.n_elements = len(left)
+ self.leaf_size = leaf_size
+
+        # min_left and max_right are used to speed up queries by skipping
+        # sub-nodes that cannot contain the point. If this node has size 0,
+        # queries are cheap, so these values don't matter.
+ if left.size > 0:
+ self.min_left = left.min()
+ self.max_right = right.max()
+ else:
+ self.min_left = 0
+ self.max_right = 0
+
+ if self.n_elements <= leaf_size:
+ # make this a terminal (leaf) node
+ self.is_leaf_node = True
+ self.left = left
+ self.right = right
+ self.indices = indices
+ self.n_center = 0
+ else:
+ # calculate a pivot so we can create child nodes
+ self.is_leaf_node = False
+ self.pivot = np.median(left / 2 + right / 2)
+ if np.isinf(self.pivot):
+ self.pivot = cython.cast({{dtype}}_t, 0)
+ if self.pivot > np.max(right):
+ self.pivot = np.max(left)
+ if self.pivot < np.min(left):
+ self.pivot = np.min(right)
+
+ left_set, right_set, center_set = self.classify_intervals(
+ left, right)
+
+ self.left_node = self.new_child_node(left, right,
+ indices, left_set)
+ self.right_node = self.new_child_node(left, right,
+ indices, right_set)
+
+ self.center_left_values, self.center_left_indices = \
+ sort_values_and_indices(left, indices, center_set)
+ self.center_right_values, self.center_right_indices = \
+ sort_values_and_indices(right, indices, center_set)
+ self.n_center = len(self.center_left_indices)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ cdef classify_intervals(self, {{dtype}}_t[:] left, {{dtype}}_t[:] right):
+ """Classify the given intervals based upon whether they fall to the
+ left, right, or overlap with this node's pivot.
+ """
+ cdef:
+ Int64Vector left_ind, right_ind, overlapping_ind
+ Py_ssize_t i
+
+ left_ind = Int64Vector()
+ right_ind = Int64Vector()
+ overlapping_ind = Int64Vector()
+
+ for i in range(self.n_elements):
+ if right[i] {{cmp_right_converse}} self.pivot:
+ left_ind.append(i)
+ elif self.pivot {{cmp_left_converse}} left[i]:
+ right_ind.append(i)
+ else:
+ overlapping_ind.append(i)
+
+ return (left_ind.to_array(),
+ right_ind.to_array(),
+ overlapping_ind.to_array())
+
+ cdef new_child_node(self,
+ ndarray[{{dtype}}_t, ndim=1] left,
+ ndarray[{{dtype}}_t, ndim=1] right,
+ ndarray[int64_t, ndim=1] indices,
+ ndarray[int64_t, ndim=1] subset):
+ """Create a new child node.
+ """
+ left = take(left, subset)
+ right = take(right, subset)
+ indices = take(indices, subset)
+ return {{dtype_title}}Closed{{closed_title}}IntervalNode(
+ left, right, indices, self.leaf_size)
+
+ @cython.wraparound(False)
+ @cython.boundscheck(False)
+ @cython.initializedcheck(False)
+ cpdef query(self, Int64Vector result, {{fused_prefix}}scalar_t point):
+ """Recursively query this node and its sub-nodes for intervals that
+ overlap with the query point.
+ """
+ cdef:
+ int64_t[:] indices
+ {{dtype}}_t[:] values
+ Py_ssize_t i
+        mask : ndarray[bool], optional
+            Not yet implemented for PyObjectHashTable.
+        ignore_na : bool, default True
+            Whether NA values should be skipped (mapped to na_sentinel and
+            left out of the uniques) rather than stored like any other value.
+ # Once we get down to a certain size, it doesn't make sense to
+ # continue the binary tree structure. Instead, we use linear
+ # search.
+ for i in range(self.n_elements):
+ if self.left[i] {{cmp_left}} point {{cmp_right}} self.right[i]:
+ result.append(self.indices[i])
+ else:
+ # There are child nodes. Based on comparing our query to the pivot,
+ # look at the center values, then go to the relevant child.
+ if point < self.pivot:
+ values = self.center_left_values
+ indices = self.center_left_indices
+ for i in range(self.n_center):
+ if not values[i] {{cmp_left}} point:
+ break
+ result.append(indices[i])
+ if point {{cmp_right}} self.left_node.max_right:
+ self.left_node.query(result, point)
+ elif point > self.pivot:
+ values = self.center_right_values
+ indices = self.center_right_indices
+ for i in range(self.n_center - 1, -1, -1):
+ if not point {{cmp_right}} values[i]:
+ break
+ result.append(indices[i])
+ if self.right_node.min_left {{cmp_left}} point:
+ self.right_node.query(result, point)
+ else:
+ result.extend(self.center_left_indices)
+
+
+NODE_CLASSES['{{dtype}}',
+ '{{closed}}'] = {{dtype_title}}Closed{{closed_title}}IntervalNode
+
+{{endfor}}
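The node classification above is the classic centered-interval-tree split: intervals entirely left of the pivot go to the left child, entirely right go to the right child, and intervals overlapping the pivot stay at this node in two sorted views, scanned until the first miss. Schematically (attribute names like intervals and center_by_left are illustrative stand-ins for the generated memoryviews, and the pruning via min_left/max_right is omitted):

    def query(node, point, out):
        """Collect indices of intervals containing point ('right'-closed)."""
        if node.is_leaf:
            out += [idx for l, r, idx in node.intervals if l < point <= r]
        elif point < node.pivot:
            for l, idx in node.center_by_left:      # ascending by left edge
                if not l < point:
                    break                           # later lefts can't contain point
                out.append(idx)
            query(node.left_child, point, out)
        elif point > node.pivot:
            for r, idx in reversed(node.center_by_right):  # descending by right edge
                if not point <= r:
                    break
                out.append(idx)
            query(node.right_child, point, out)
        else:
            # point == pivot: every center interval contains it
            out += [idx for _, idx in node.center_by_left]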
diff --git a/contrib/python/pandas/py3/pandas/_libs/khash_for_primitive_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/khash_for_primitive_helper.pxi.in
new file mode 100644
index 0000000000..d0934b3e0e
--- /dev/null
+++ b/contrib/python/pandas/py3/pandas/_libs/khash_for_primitive_helper.pxi.in
@@ -0,0 +1,44 @@
+"""
+Template for wrapping khash-tables for each primitive `dtype`
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+{{py:
+
+# name, c_type
+primitive_types = [('int64', 'int64_t'),
+ ('uint64', 'uint64_t'),
+ ('float64', 'float64_t'),
+ ('int32', 'int32_t'),
+ ('uint32', 'uint32_t'),
+ ('float32', 'float32_t'),
+ ('int16', 'int16_t'),
+ ('uint16', 'uint16_t'),
+ ('int8', 'int8_t'),
+ ('uint8', 'uint8_t'),
+ ('complex64', 'khcomplex64_t'),
+ ('complex128', 'khcomplex128_t'),
+ ]
+}}
+
+{{for name, c_type in primitive_types}}
+
+cdef extern from "khash_python.h":
+ ctypedef struct kh_{{name}}_t:
+ khuint_t n_buckets, size, n_occupied, upper_bound
+ uint32_t *flags
+ {{c_type}} *keys
+ size_t *vals
+
+ kh_{{name}}_t* kh_init_{{name}}() nogil
+ void kh_destroy_{{name}}(kh_{{name}}_t*) nogil
+ void kh_clear_{{name}}(kh_{{name}}_t*) nogil
+ khuint_t kh_get_{{name}}(kh_{{name}}_t*, {{c_type}}) nogil
+ void kh_resize_{{name}}(kh_{{name}}_t*, khuint_t) nogil
+ khuint_t kh_put_{{name}}(kh_{{name}}_t*, {{c_type}}, int*) nogil
+ void kh_del_{{name}}(kh_{{name}}_t*, khuint_t) nogil
+
+ bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil
+
+{{endfor}}
diff --git a/contrib/python/pandas/py3/pandas/_libs/sparse_op_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/sparse_op_helper.pxi.in
new file mode 100644
index 0000000000..774a8c579f
--- /dev/null
+++ b/contrib/python/pandas/py3/pandas/_libs/sparse_op_helper.pxi.in
@@ -0,0 +1,313 @@
+"""
+Template for each `dtype` helper function for sparse ops
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+# ----------------------------------------------------------------------
+# Sparse op
+# ----------------------------------------------------------------------
+
+ctypedef fused sparse_t:
+ float64_t
+ int64_t
+
+
+cdef float64_t __div__(sparse_t a, sparse_t b):
+ if b == 0:
+ if a > 0:
+ return INF
+ elif a < 0:
+ return -INF
+ else:
+ return NaN
+ else:
+ return float(a) / b
+
+
+cdef float64_t __truediv__(sparse_t a, sparse_t b):
+ return __div__(a, b)
+
+
+cdef sparse_t __mod__(sparse_t a, sparse_t b):
+ if b == 0:
+ if sparse_t is float64_t:
+ return NaN
+ else:
+ return 0
+ else:
+ return a % b
+
+
+cdef sparse_t __floordiv__(sparse_t a, sparse_t b):
+ if b == 0:
+ if sparse_t is float64_t:
+ # Match non-sparse Series behavior implemented in mask_zero_div_zero
+ if a > 0:
+ return INF
+ elif a < 0:
+ return -INF
+ return NaN
+ else:
+ return 0
+ else:
+ return a // b
+
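These helpers pin down the fill-value arithmetic for division by zero, matching dense Series semantics: 1/0 -> inf, -1/0 -> -inf, 0/0 -> NaN, and integer mod/floordiv by zero -> 0. A plain-Python mirror of __div__:

    import math

    def safe_div(a, b):   # plain-Python mirror of __div__ above
        if b == 0:
            return math.inf if a > 0 else (-math.inf if a < 0 else math.nan)
        return a / b

    safe_div(1.0, 0.0)    # inf
    safe_div(-1.0, 0.0)   # -inf
    safe_div(0.0, 0.0)    # nan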
+
+# ----------------------------------------------------------------------
+# sparse array op
+# ----------------------------------------------------------------------
+
+{{py:
+
+# dtype, arith_comp_group, logical_group
+dtypes = [('float64', True, False),
+ ('int64', True, True),
+ ('uint8', False, True)]
+# do not generate arithmetic / comparison template for uint8,
+# it should be done in fused types
+
+def get_op(tup):
+ assert isinstance(tup, tuple)
+ assert len(tup) == 4
+
+ opname, lval, rval, dtype = tup
+
+ ops_dict = {'add': '{0} + {1}',
+ 'sub': '{0} - {1}',
+ 'mul': '{0} * {1}',
+ 'div': '__div__({0}, {1})',
+ 'mod': '__mod__({0}, {1})',
+ 'truediv': '__truediv__({0}, {1})',
+ 'floordiv': '__floordiv__({0}, {1})',
+ 'pow': '{0} ** {1}',
+ 'eq': '{0} == {1}',
+ 'ne': '{0} != {1}',
+ 'lt': '{0} < {1}',
+ 'gt': '{0} > {1}',
+ 'le': '{0} <= {1}',
+ 'ge': '{0} >= {1}',
+
+ 'and': '{0} & {1}', # logical op
+ 'or': '{0} | {1}',
+ 'xor': '{0} ^ {1}'}
+
+ return ops_dict[opname].format(lval, rval)
+
+
+def get_dispatch(dtypes):
+
+ ops_list = ['add', 'sub', 'mul', 'div', 'mod', 'truediv',
+ 'floordiv', 'pow',
+ 'eq', 'ne', 'lt', 'gt', 'le', 'ge',
+ 'and', 'or', 'xor']
+
+ for opname in ops_list:
+ for dtype, arith_comp_group, logical_group in dtypes:
+
+ if opname in ('div', 'truediv'):
+ rdtype = 'float64'
+ elif opname in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
+ # comparison op
+ rdtype = 'uint8'
+ elif opname in ('and', 'or', 'xor'):
+ # logical op
+ rdtype = 'uint8'
+ else:
+ rdtype = dtype
+
+ if opname in ('and', 'or', 'xor'):
+ if logical_group:
+ yield opname, dtype, rdtype
+ else:
+ if arith_comp_group:
+ yield opname, dtype, rdtype
+
+}}
+
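get_op is pure template plumbing: it splices operand expressions into an operator pattern at code-generation time, before Cython ever sees the file. For example, with entries as in the dict above:

    ops_dict = {'add': '{0} + {1}', 'div': '__div__({0}, {1})'}

    ops_dict['add'].format('x[xi]', 'yfill')    # -> 'x[xi] + yfill'
    ops_dict['div'].format('xfill', 'y[yi]')    # -> '__div__(xfill, y[yi])'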
+
+{{for opname, dtype, rdtype in get_dispatch(dtypes)}}
+
+{{if opname == "pow"}}
+@cython.cpow(True) # Cython 3 matches Python pow, which isn't what we want here
+{{endif}}
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef tuple block_op_{{opname}}_{{dtype}}({{dtype}}_t[:] x_,
+ BlockIndex xindex,
+ {{dtype}}_t xfill,
+ {{dtype}}_t[:] y_,
+ BlockIndex yindex,
+ {{dtype}}_t yfill):
+ """
+ Binary operator on BlockIndex objects with fill values
+ """
+
+ cdef:
+ BlockIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xbp = 0, ybp = 0 # block positions
+ int32_t xloc, yloc
+ Py_ssize_t xblock = 0, yblock = 0 # block numbers
+
+ {{dtype}}_t[:] x, y
+ ndarray[{{rdtype}}_t, ndim=1] out
+
+ # to suppress Cython warning
+ x = x_
+ y = y_
+
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.{{rdtype}})
+
+ # Wow, what a hack job. Need to do something about this
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if yblock == yindex.nblocks:
+ # use y fill value
+ out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ continue
+
+ if xblock == xindex.nblocks:
+ # use x fill value
+ out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+ continue
+
+ yloc = yindex.locbuf[yblock] + ybp
+ xloc = xindex.locbuf[xblock] + xbp
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}}
+ xi += 1
+ yi += 1
+
+ # advance both locations
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
+ xi += 1
+
+ # advance x location
+ xbp += 1
+ if xbp == xindex.lenbuf[xblock]:
+ xblock += 1
+ xbp = 0
+ else:
+ # use x fill value
+ out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
+ yi += 1
+
+ # advance y location
+ ybp += 1
+ if ybp == yindex.lenbuf[yblock]:
+ yblock += 1
+ ybp = 0
+
+ return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}}
+
+{{if opname == "pow"}}
+@cython.cpow(True) # Cython 3 matches Python pow, which isn't what we want here
+{{endif}}
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef tuple int_op_{{opname}}_{{dtype}}({{dtype}}_t[:] x_,
+ IntIndex xindex,
+ {{dtype}}_t xfill,
+ {{dtype}}_t[:] y_,
+ IntIndex yindex,
+ {{dtype}}_t yfill):
+ cdef:
+ IntIndex out_index
+ Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
+ int32_t xloc, yloc
+ int32_t[:] xindices, yindices, out_indices
+ {{dtype}}_t[:] x, y
+ ndarray[{{rdtype}}_t, ndim=1] out
+
+ # suppress Cython compiler warnings due to inlining
+ x = x_
+ y = y_
+
+ # need to do this first to know size of result array
+ out_index = xindex.make_union(yindex)
+ out = np.empty(out_index.npoints, dtype=np.{{rdtype}})
+
+ xindices = xindex.indices
+ yindices = yindex.indices
+ out_indices = out_index.indices
+
+ # walk the two SparseVectors, adding matched locations...
+ for out_i in range(out_index.npoints):
+ if xi == xindex.npoints:
+ # use x fill value
+ out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
+ yi += 1
+ continue
+
+ if yi == yindex.npoints:
+ # use y fill value
+ out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
+ xi += 1
+ continue
+
+ xloc = xindices[xi]
+ yloc = yindices[yi]
+
+ # each index in the out_index had to come from either x, y, or both
+ if xloc == yloc:
+ out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}}
+ xi += 1
+ yi += 1
+ elif xloc < yloc:
+ # use y fill value
+ out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
+ xi += 1
+ else:
+ # use x fill value
+ out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
+ yi += 1
+
+ return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}}
+
+
+cpdef sparse_{{opname}}_{{dtype}}({{dtype}}_t[:] x,
+ SparseIndex xindex, {{dtype}}_t xfill,
+ {{dtype}}_t[:] y,
+ SparseIndex yindex, {{dtype}}_t yfill):
+
+ if isinstance(xindex, BlockIndex):
+ return block_op_{{opname}}_{{dtype}}(x, xindex.to_block_index(), xfill,
+ y, yindex.to_block_index(), yfill)
+ elif isinstance(xindex, IntIndex):
+ return int_op_{{opname}}_{{dtype}}(x, xindex.to_int_index(), xfill,
+ y, yindex.to_int_index(), yfill)
+ else:
+ raise NotImplementedError
+
+{{endfor}}
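A usage sketch of one generated entry point, assuming the upstream module layout where IntIndex and the sparse_* functions live in pandas._libs.sparse; the data here is illustrative:

    import numpy as np
    from pandas._libs.sparse import IntIndex, sparse_add_float64  # assumed path

    # two sparse float64 vectors of logical length 5
    x = np.array([1.0, 2.0])
    xindex = IntIndex(5, np.array([0, 3], dtype=np.int32))
    y = np.array([4.0])
    yindex = IntIndex(5, np.array([3], dtype=np.int32))

    out, out_index, fill = sparse_add_float64(x, xindex, 0.0, y, yindex, 0.0)
    # out -> [1.0, 6.0] at union positions [0, 3]; fill -> 0.0 + 0.0 = 0.0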
diff --git a/contrib/python/pandas/py3/patches/01-arcadia.patch b/contrib/python/pandas/py3/patches/01-arcadia.patch
new file mode 100644
index 0000000000..b228067b37
--- /dev/null
+++ b/contrib/python/pandas/py3/patches/01-arcadia.patch
@@ -0,0 +1,33 @@
+--- contrib/python/pandas/py3/pandas/_libs/src/klib/khash_python.h (index)
++++ contrib/python/pandas/py3/pandas/_libs/src/klib/khash_python.h (working tree)
+@@ -23,1 +23,1 @@ typedef npy_complex128 khcomplex128_t;
+-void *traced_malloc(size_t size){
++static void *traced_malloc(size_t size){
+@@ -31,1 +31,1 @@ void *traced_malloc(size_t size){
+-void *traced_calloc(size_t num, size_t size){
++static void *traced_calloc(size_t num, size_t size){
+@@ -39,1 +39,1 @@ void *traced_calloc(size_t num, size_t size){
+-void *traced_realloc(void* old_ptr, size_t size){
++static void *traced_realloc(void* old_ptr, size_t size){
+@@ -50,1 +50,1 @@ void *traced_realloc(void* old_ptr, size_t size){
+-void traced_free(void* ptr){
++static void traced_free(void* ptr){
+--- contrib/python/pandas/py3/pandas/_libs/src/ujson/python/date_conversions.c (index)
++++ contrib/python/pandas/py3/pandas/_libs/src/ujson/python/date_conversions.c (working tree)
+@@ -9,2 +9,2 @@ The full license is in the LICENSE file, distributed with this software.
+-#include <../../../tslibs/src/datetime/np_datetime.h>
+-#include <../../../tslibs/src/datetime/np_datetime_strings.h>
++#include "../../../tslibs/src/datetime/np_datetime.h"
++#include "../../../tslibs/src/datetime/np_datetime_strings.h"
+--- contrib/python/pandas/py3/pandas/compat/_optional.py (index)
++++ contrib/python/pandas/py3/pandas/compat/_optional.py (working tree)
+@@ -41,1 +41,1 @@ VERSIONS = {
+- "sqlalchemy": "1.4.16",
++ "sqlalchemy": "1.2.0",
+--- contrib/python/pandas/py3/ya.make (index)
++++ contrib/python/pandas/py3/ya.make (working tree)
+@@ -44,2 +44,4 @@ CFLAGS(
+
++INCLUDE(symbols.cmake)
++
+ SRCS(